From 28105bc50ee4f0935fc1a32256584335548a15f2 Mon Sep 17 00:00:00 2001 From: gtema Date: Mon, 5 Jun 2023 06:34:45 +0000 Subject: [PATCH] chore: use convertor from separate project Reviewed-by: Hasko, Vladimir Co-authored-by: gtema Co-committed-by: gtema --- otc_doc_convertor/__init__.py | 0 otc_doc_convertor/comparator.py | 95 --- otc_doc_convertor/convertor.py | 777 ------------------ otc_doc_convertor/tests/unit/__init__.py | 0 .../tests/unit/test_convertor.py | 249 ------ playbooks/pre.yaml | 4 +- setup.cfg | 15 +- templates/conf.py | 2 +- zuul.yaml | 35 +- 9 files changed, 8 insertions(+), 1169 deletions(-) delete mode 100644 otc_doc_convertor/__init__.py delete mode 100644 otc_doc_convertor/comparator.py delete mode 100644 otc_doc_convertor/convertor.py delete mode 100644 otc_doc_convertor/tests/unit/__init__.py delete mode 100644 otc_doc_convertor/tests/unit/test_convertor.py diff --git a/otc_doc_convertor/__init__.py b/otc_doc_convertor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/otc_doc_convertor/comparator.py b/otc_doc_convertor/comparator.py deleted file mode 100644 index 69014754..00000000 --- a/otc_doc_convertor/comparator.py +++ /dev/null @@ -1,95 +0,0 @@ -import argparse -import logging -import requests -import pathlib -from bs4 import BeautifulSoup - - -def body_filter(tag): - return ( - tag.name == "div" - and tag.has_attr("id") - and tag["id"].startswith("body") - ) - - -def simplify_body(data): - return data.get_text().replace(" ", "") - - -class OTCComparator: - - def compare(self, url_prefix, file_path, file_name): - try: - data = requests.get( - f"https://docs.otc.t-systems.com/{url_prefix}/" - f"{file_name}.json") - page_data = None - for item in data.json(): - if ( - item.get("url").endswith(f"{file_name}.html") - and item['content'] - ): - page_data = item["content"] - break - original = BeautifulSoup(page_data, 'html.parser') - with open(f"{file_path}/{file_name}.html", "r") as f: - new_content = f.read() - new = BeautifulSoup(new_content, 'html.parser') - t1 = original.find(body_filter) - t2 = new.find(body_filter) - if t1 != t2: - if simplify_body(t1) == simplify_body(t2): - logging.error( - "File %s is not matching, but " - "plain text matches" % file_name) - return True - else: - logging.error("File %s mismatches" % file_name) - logging.debug( - "Proposed content: %s" % - t2.get_text().encode("unicode_escape").decode("utf-8")) - logging.debug( - "Current content: %s" % - t1.get_text().encode("unicode_escape").decode("utf-8")) - return False - else: - logging.info("Content matches") - return True - except Exception as ex: - logging.error("Content comparison error %s" % ex) - return False - - def main(self): - logging.basicConfig(level=logging.DEBUG) - parser = argparse.ArgumentParser(description="Compare document data.") - parser.add_argument( - "path", - type=str, - help="Path to the document content (i.e. docs/ecs/api-ref") - parser.add_argument( - "url", - type=str, - help="url prefix in the helpcenter (i.e. 
api/ecs)") - args = parser.parse_args() - match = True - - for f in pathlib.Path(args.path).glob("*.html"): - logging.info(f"Comparing {f.name}") - if not self.compare( - args.url, args.path, f.name.replace(".html", "")): - match = False - - if not match: - logging.error("Comparison showed deviations") - exit(1) - else: - logging.info("No deviations found") - - -def main(): - OTCComparator().main() - - -if __name__ == '__main__': - main() diff --git a/otc_doc_convertor/convertor.py b/otc_doc_convertor/convertor.py deleted file mode 100644 index 305c0c41..00000000 --- a/otc_doc_convertor/convertor.py +++ /dev/null @@ -1,777 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import bs4 -import json -import logging -import os -import pathlib -import re -import shutil - -from jinja2 import FileSystemLoader, Environment, select_autoescape - - -class OTCDocConvertor: - def __init__(self): - self.doc_anchors = dict() - self.doc_links = dict() - - @staticmethod - def get_new_name(current_name): - new_name = current_name.replace(" - ", "_") - # This is a unicode char - new_name = new_name.replace("–", "_") - new_name = new_name.replace(" ", "_") - new_name = new_name.replace("/", "_") - new_name = new_name.replace("=", "_") - new_name = new_name.replace("+", "") - new_name = new_name.replace("'", "") - new_name = new_name.replace('"', "") - new_name = new_name.replace("`", "") - new_name = new_name.replace("´", "") - new_name = new_name.replace(":", "") - new_name = new_name.replace("?", "") - new_name = new_name.replace("(", "") - new_name = new_name.replace(")", "") - new_name = new_name.replace(",", "") - new_name = new_name.replace("!", "") - new_name = new_name.replace("<", "") - new_name = new_name.replace(">", "") - new_name = new_name.replace("$", "") - new_name = new_name.replace("#", "sharp") - new_name = new_name.replace("%", "pct") - new_name = new_name.replace('_&_', '_and_') - new_name = re.sub(r'(\w+)&(\w+)', r'\1_and_\2', new_name) - new_name = re.sub('(_+)', '_', new_name) - new_name = new_name.lower() - return new_name - - @staticmethod - def build_doc_tree(metadata): - flat_tree = dict() - for k, v in metadata.items(): - parent_id = v.get("p_code") - if not parent_id: - parent_id = 0 - - if parent_id not in flat_tree: - flat_tree[parent_id] = list() - flat_tree[parent_id].append(v) - return flat_tree - - @classmethod - def get_target_path(cls, code, metadata): - if code in metadata: - current = metadata[code] - if not current.get("p_code"): - return current["new_name"] - else: - return "{0}/{1}".format( - cls.get_target_path(current["p_code"], metadata), - current["new_name"], - ) - else: - return "" - - def make_label(self, soup, name): - label = soup.new_tag("p") - label.string = f"..\\_{name.lower()}:" - return label - - def is_element_referred(self, ref, fname): - return ( - ref in self.doc_links - or "#" + ref in self.doc_links - or fname.lower() + "#" + ref in self.doc_links - ) - - def rawize_me(self, soup, expressions): - for to_rawize in expressions: - for p in soup.body.find_all(string=re.compile(to_rawize)): - if p.string and p.parent.name not in [ - "b", "strong", "pre", "code"]: - curr = p.string - part = re.search(to_rawize, curr) - # We should not escape inside of bold - this is wrong - if len(part.groups()) > 0: - logging.debug( - "Found element to rawize %s", part.group(1) - ) - new = curr.replace( - part.group(1), f"{part.group(1)}" - ) - logging.debug("Replacing string with: %s", new) - p.replace_with(bs4.BeautifulSoup(new, "html.parser")) - 
logging.debug("Replacing string with: %s", p.string) - else: - logging.error( - "Cannot find string for rawization anymore" - ) - - def streamline_html(self, soup, file_name, args=None): - # Drop eventual header duplicated anchors - fname = file_name.replace(".html", "").lower() - page_anchors = set() - met_page_anchors = dict() - for lnk in soup.body.find_all("a"): - name = None - if "name" in lnk.attrs and lnk.string is None: - name = lnk.attrs["name"].lower() - if name in met_page_anchors: - # Such anchor already existed on this page, drop it - lnk.decompose() - met_page_anchors[name] = True - - if name and name.lower() == fname: - lnk.decompose() - - # Process divs - for i in soup.body.find_all("div"): - if i.decomposed: - # if we decompose a later in the code it may still be returned - # here in the list. Skip those - continue - if "note" in i.get("class", []): - # Notes - del i["id"] - if i.img: - i.img.decompose() - notetitle = i.find("span", class_="notetitle") - notebody = i.find(class_="notebody") - if not (notebody and notebody.get_text()): - # Some smart people make empty notes. Since this is - # breaking layout we need to drop those - i.decompose() - elif notetitle: - title = soup.new_tag("div") - title["class"] = "title" - title.string = "Note:" - notetitle.replace_with(title) - elif "warning" in i.get("class", []): - # Warnings - del i["id"] - if i.img: - i.img.decompose() - eltitle = i.find("span", class_="warningtitle") - if eltitle: - title = soup.new_tag("div") - title["class"] = "title" - title.string = "Warning:" - eltitle.replace_with(title) - elif "notice" in i.get("class", []): - # Notices - del i["id"] - if i.img: - i.img.decompose() - i["class"] = "important" - elif "caution" in i.get("class", []): - # Cautions - del i["id"] - if i.img: - i.img.decompose() - elif "fignone" in i.get("class", []): - # Figures - # When we found figure generate local label (anchor) - if i.get("id"): - logging.debug("place figure label") - i.insert_before(self.make_label(soup, i.get("id"))) - figure = soup.new_tag("figure") - img = i.find("img") - cap = i.find("span", class_="figcap") - if cap is not None: - cap.name = "figcaption" - figure.append(cap) - if img: - # Store all referred images for copying - self.doc_images.add(img["src"]) - img["src"] = ( - "/_static/images/" - + os.path.basename(img["src"]) - ) - del img["width"] - del img["height"] - del img["class"] - del img["title"] - del img["name"] - del img["id"] - figure.append(img) - i.replace_with(figure) - elif "section" in i.get("class", []): - # Sections - if i.get("id"): - # When we found section generate local label (anchor) - sec_id = i.get("id").lower() - if self.is_element_referred(sec_id, file_name): - page_anchors.add(sec_id) - i.insert_before(self.make_label(soup, sec_id)) - i.unwrap() - elif i.get("id") and i.get("id").startswith("body"): - i.unwrap() - else: - i.name = "p" - - # Process remaining images - for img in soup.body.find_all("img"): - if img["src"] and not img["src"].startswith("/_static/images"): - self.doc_images.add(img["src"]) - img["src"] = "/_static/images/" + os.path.basename(img["src"]) - del img["width"] - del img["height"] - del img["class"] - del img["title"] - del img["id"] - - # Drop strong in table headers "/" - for th in soup.body.find_all("th"): - if th.p and th.p.strong: - th.p.strong.unwrap() - - if args and args.improve_table_headers: - # Add spaces around "/" - for th in soup.body.find_all("th"): - if hasattr(th, "p") and th.p.string: - th.p.string = re.sub(r"\b/\b", " / ", 
th.p.string) - - # Drop strong around links "/" - for strong in soup.body.find_all("strong"): - if strong.a: - strong.unwrap() - - # table anchors - some tables are referred. Some are having anchor in - # front, some not. In order to cope with that we analyze every table - # and if it is referred - prepend anchor. Next anchor processing will - # skiip it, since such anchor is already placed on the page - for table in soup.body.find_all("table"): - # Verify this is really called from somewhere: - if table.get("id"): - local_ref = table["id"].lower() - if self.is_element_referred(local_ref, file_name): - # We now know something in the document wants this anchor - - # replace it with label - if local_ref not in page_anchors: - lnk = bs4.BeautifulSoup( - f"
<p>..\\_{local_ref}:</p>
", "html.parser" - ) - table.insert_before(lnk) - page_anchors.add(local_ref) - else: - logging.debug( - "Not placing replaced anchor %s " - "since it already existed", - local_ref, - ) - - # local anchors - for lnk in soup.body.find_all("a"): - if ( - lnk.string is None - and lnk.has_attr("name") - and not re.match(r"^li\d+$", lnk.attrs["name"]) - # anywhere section - and not re.match(r".*section\d+$", lnk.attrs["name"]) - # starts with table - and not re.match(r"^table\d+$", lnk.attrs["name"]) - ): - # Verify this is really called from somewhere: - local_ref = lnk["name"].lower() - if self.is_element_referred(local_ref, file_name): - # We now know something in the document wants this anchor - - # replace it with label - if local_ref not in page_anchors: - logging.debug("Adding anchor") - lnk.name = "p" - lnk.string = f"..\\_{local_ref}:" - del lnk["name"] - page_anchors.add(local_ref) - else: - logging.debug( - "Not placing replaced anchor %s " - " since it already existed", - local_ref, - ) - else: - logging.debug("Dropping unreferred link %s", lnk) - - # Undeline element should not be used at all - for underline in soup.body.find_all("u"): - underline.unwrap() - - for li in soup.body.find_all("li"): - if not li.get("id"): - continue - local_ref = li.get("id").lower() - # Delete li ID to prevent broken RST containers - del li["id"] - - if ( - self.is_element_referred(local_ref, file_name) - and local_ref not in page_anchors - ): - # LI item referred, but no anchor present - logging.debug("Adding missing li %s anchor", local_ref) - li.insert(0, self.make_label(soup, local_ref)) - page_anchors.add(local_ref) - - # Sometimes we have code blocks with line numbers. - #
- # """ - expected = """""" - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('tr')), expected.strip()) - - def test_streamline_html_escape_3(self): - test_data = """ - - """ - expected = """ - - """ - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('td')), expected.strip()) - - def test_streamline_html_escape_4(self): - test_data = """ - - """ - expected = """ - - """ - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('td')), expected.strip()) - - def test_streamline_html_escape_5(self): - test_data = """ -
  • "dss:*:get",
  • "dss:*:list",
  • "dss:*:count"
- """ - expected = """ -
  • "dss:``*``:get",
  • "dss:``*``:list",
  • "dss:``*``:count"
- """ - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('ul')), expected.strip()) - - def test_streamline_html_escape_6(self): - test_data = """ -
  • "vpc:*:*"
- """ - expected = """ -
  • "vpc:``*``:``*``"
- """ - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('ul')), expected.strip()) - - def test_streamline_html_escape_7(self): - test_data = """ -
  • vpc:securityGroups:*
  • vpc:securityGroupRules:*
- """ - expected = """ -
  • vpc:securityGroups:*
  • vpc:securityGroupRules:*
- """ - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('ul')), expected.strip()) - - def test_streamline_html_escape_8(self): - test_data = """ -

cce:kubernetes:*

- """ - expected = """ -

cce:kubernetes:*

- """ - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('p')), expected.strip()) - - def test_streamline_html_escape_9(self): - test_data = """ -
  • The folder name contains a maximum of 255 characters, and the - full path cannot exceed 1,023 characters.
  • -
  • The folder name cannot be empty.
  • -
  • The folder name cannot contain the following special characters: - /:*?"<>|\;&,'`!{}[]$%+
  • -
  • The value cannot start or end with a period (.).
  • -
  • The spaces at the beginning and end are ignored.
- """ # noqa - expected = """ -
  • The folder name contains a maximum of 255 characters, and the - full path cannot exceed 1,023 characters.
  • The folder name cannot be empty.
  • The folder name cannot contain the following special characters: /:*?"<>|\\;&,\'`!{}[]$%+
  • The value cannot start or end with a period (.).
  • The spaces at the beginning and end are ignored.
- """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('ul')).replace('\n', ''), - expected.strip().replace('\n', ''), - ) - - def test_streamline_html_escape_10(self): - test_data = """ -

ipFilterRules=allow:ip:127.*, allow:name:localhost, deny:ip:*

- """ # noqa - expected = """ -

ipFilterRules=allow:ip:127.*, allow:name:localhost, deny:ip:*

- """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('p')), - expected.strip(), - ) - - def test_streamline_html_escape_11(self): - test_data = """ - - """ # noqa - expected = """ - - """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('td')), - expected.strip(), - ) - - def test_streamline_html_escape_12(self): - test_data= """ -

DLI:*:*:database:databases.dbname

- """ # noqa - expected = """ -

DLI:``*``:``*``:database:databases.dbname

- """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('p')), - expected.strip(), - ) - - def test_streamline_html_escape_13(self): - test_data= """ -

DLI:*:*:queue:queues.*

- """ # noqa - expected = """ -

DLI:``*``:``*``:queue:queues.*

- """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('p')), - expected.strip(), - ) - - def test_streamline_html_escape_14(self): - test_data= """ -
  • Use /*+ MAPJOIN(join_table) */.
  • - """ # noqa - expected = """ -
  • Use /*+ MAPJOIN(join_table) */.
  • - """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('li')), - expected.strip(), - ) - - def test_streamline_html_escape_15(self): - test_data= """ -

    /*FORCE_MASTER*/: A SQL statement is routed to the primary DB instance.

    - """ # noqa - expected = """ -

    /*FORCE_MASTER*/: A SQL statement is routed to the primary DB instance.

    - """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('p')), - expected.strip(), - ) - - def test_streamline_html_escape_16(self): - test_data= """ - - """ # noqa - expected = """ - - """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('th')), - expected.strip(), - ) - - def test_streamline_html_escape_17(self): - test_data= """ - - """ # noqa - expected = """ - - """ # noqa - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual( - str(res.find('th')), - expected.strip(), - ) diff --git a/playbooks/pre.yaml b/playbooks/pre.yaml index eb93ac2b..2203f28c 100644 --- a/playbooks/pre.yaml +++ b/playbooks/pre.yaml @@ -5,12 +5,10 @@ - ensure-pip - ensure-virtualenv - role: "ensure-pandoc" - vars: - ensure_pandoc_version: "2.19.2" tasks: - name: Install convertor pip: chdir: "{{ zuul.project.src_dir }}" virtualenv: "{{ ansible_user_dir }}/.venv" - name: . + name: "{{ ansible_user_dir }}/{{ zuul.projects['gitea.eco.tsi-dev.otc-service.com/docs/doc-convertor'].src_dir }}" editable: "yes" diff --git a/setup.cfg b/setup.cfg index a7519989..df8c8162 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,11 +1,11 @@ [metadata] -name = otc-doc-convertor +name = otc-doc-exports author = Open Telekom Cloud - Ecosystem Squad author_email = dl-pbcotcdeleco@t-systems.com -description = Python program to convert docs exported in HTML into RST +description = Doc sources (HTML) to track changes in the vendors documentation system description_file = README.md -home_page = https://github.com/opentelekomcloud-docs/doc-exports +home_page = https://gitea.eco.tsi-dev.otc-service.com/docs/doc-exports classifier = License :: OSI Approved :: Apache Software License Operating System :: POSIX :: Linux @@ -17,12 +17,3 @@ classifier = Programming Language :: Python :: 3 Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 -keywords = Sphinx, search, python - -[options] -packages = otc_doc_convertor - -[options.entry_points] -console_scripts = - otc-convert-doc = otc_doc_convertor.convertor:main - otc-convert-compare = otc_doc_convertor.comparator:main diff --git a/templates/conf.py b/templates/conf.py index 8ce7e2dd..11f08baa 100644 --- a/templates/conf.py +++ b/templates/conf.py @@ -18,7 +18,7 @@ import os import sys extensions = [ - 'otcdocstheme' + 'otcdocstheme', ] otcdocs_auto_name = False diff --git a/zuul.yaml b/zuul.yaml index 0305fd5d..08748af8 100644 --- a/zuul.yaml +++ b/zuul.yaml @@ -10,6 +10,8 @@ pre-run: playbooks/pre.yaml run: playbooks/run.yaml post-run: playbooks/post.yaml + required-projects: + - name: docs/doc-convertor vars: docs_update_data_file: "metadata.yaml" @@ -20,7 +22,6 @@ Convert Application doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/aom - docs/cse - docs/dms @@ -39,7 +40,6 @@ Convert BigData doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/apig - docs/css - docs/dataartsstudio @@ -62,7 +62,6 @@ Convert Compute doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/as - docs/bms - docs/deh @@ -82,7 +81,6 @@ Convert Container doc exports from html to rst and generate corresponding rst patch files. 
files: - - otc_doc_convertor/convertor.py - docs/cce - docs/cci - docs/swr @@ -100,7 +98,6 @@ Convert Database doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/das - docs/dcs - docs/ddm @@ -125,7 +122,6 @@ Convert Network doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/cdn - docs/dc - docs/dns @@ -151,7 +147,6 @@ Convert Management doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/ces - docs/cts - docs/lts @@ -172,7 +167,6 @@ Convert Security doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/antiddos - docs/dbss - docs/iam @@ -193,7 +187,6 @@ Convert Storage doc exports from html to rst and generate corresponding rst patch files. files: - - otc_doc_convertor/convertor.py - docs/cbr - docs/evs - docs/obs @@ -208,22 +201,6 @@ vars: docs_service_category: "storage" -- job: - name: otc-doc-exports-compare-current-hc - parent: unittests - voting: false - description: | - Verify whether content is matching the one published in the current - HelpCenter. - files: - - docs - - otc_doc_convertor/comparator.py - - roles/compare - vars: - docs_update_data_file: "metadata.yaml" - pre-run: playbooks/pre.yaml - run: playbooks/compare.yaml - - project: merge-mode: squash-merge default-branch: main @@ -231,14 +208,11 @@ docs_rst_location: "docs" docs_base_location: "base" docs_new_location: "new" - ensure_pandoc_version: "3.1" + ensure_pandoc_version: "2.19.2" propose_change_git_provider: "gitea" propose_change_git_baseurl: "gitea.eco.tsi-dev.otc-service.com" check: jobs: - - otc-tox-py39 - - otc-tox-pep8: - nodeset: ubuntu-focal - otc-doc-exports-convert-application - otc-doc-exports-convert-big-data - otc-doc-exports-convert-compute @@ -284,11 +258,8 @@ dependencies: - name: otc-doc-exports-convert-storage soft: true - # - otc-doc-exports-compare-current-hc gate: jobs: - - otc-tox-pep8: - nodeset: ubuntu-focal - otc-doc-exports-convert-application - otc-doc-exports-convert-big-data - otc-doc-exports-convert-compute
    -        # (inline HTML example elided: a <table> with a "linenos" cell
    -        # numbering lines 1, 2, 3 next to a "code" cell holding the
    -        # source lines)
    - for td_lines in soup.body.find_all("td", "linenos"): - codeblock = td_lines.parent.find("td", "code") - table = td_lines.find_parent("table") - if codeblock and table: - # Replace whole table with only codeblock td - logging.debug("Replace %s with %s" % (table, codeblock)) - codeblock.name = "pre" - del codeblock["class"] - table.replace_with(codeblock) - - for pre in soup.body.find_all("pre"): - text = pre.get_text() - # if text.startswith("{"): - # pre["class"] = "data" - if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text): - # Something like "[root@ecs-test-0001 ~]#" - pre["class"] = "console" - elif re.match(r"^(GET|PUT|POST|DELETE)", text): - # Something like "DELETE https://some_url" - pre["class"] = "text" - if "codeblock" in pre.get("class", []): - #
    {em.string}"
    -                em.replace_with(bs4.BeautifulSoup(new, "html.parser"))
    -
    -        # Incredibly dirty hacks:
    -        rawize_expressions = [
    -            # DWS Dev Guide hardcodes
    -            r"^(1\|\"iamtext\"\|\"iamvarchar\"\|2006-07-07\|12:00:00)$",
    -            r"^(2\|\"iamtext\"\|\"iamvarchar\"\|2022-07-07\|19:00:02)$",
    -            r"(\*max_files_per_process\*3)",
    -            r"(&&, &&&, .* <#>)",
    -            # r"(\*\+)",
    -            r"(-\|-)",
    -            r"(^-{8}$)"
    -        ]
    -        self.rawize_me(soup, rawize_expressions)
    -
    -        # Special asterisk treatment
    -        escape_asterisk_re = r"\((\*)[\.,]"
    -        self.rawize_me(soup, [escape_asterisk_re])
    -
    -        # And now remaining specialities
    -        rawize_strings = [
    -            # "\*\*\*\*\*\*",
    -            # r"([\\\/\:\*\?\"\~|<>]{4,})"
    -            # ModelArts UMN contains this "unallowed" sequence
    -            r"(\\/:\*\?\"<>\|)",
    -            # CSS UMN contains "SELECT exclude('*name') FROM my-index"
    -            r"(\*name)",
    -            # DMS UMN contains: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
    -            r"\(([\W\x60_]{10,})\)",
    -            # MRS UMN contains: /:*?"<>|\\;&,'`!{}[]$%+
    -            r"\s([^a-zA-Z0-9\s]{8,})",
    -            # MRS operation guide contains: /*+ MAPJOIN(join_table) \*/
    -            # RDS UMN contains: /*FORCE_MASTER*/
    -            r"(/\*.{5,}\*/)",
    -            # BMS API contains a sequence in a dedicated paragraph
    -            r"^([^a-zA-Z0-9\s]{10,})$",
    -            # OBS special chars - "\$" "\\" etc
    -            r"^(\\[\$\\bfnrtvu]{1})$",
    -            # CES contains: urn:smn:([a-z]|[A-Z]|[0-9]|\\-){1,32}:....
    -            r"\s(urn:smn:\(.*)\.",
    -            # a "-" alone (in tables) is considered a list
    -            r"^(-)$",
    -            # MRS component guide has: "./mydate_\\\\d*/"
    -            r"\w(_)\\",
    -            # Pandoc is not properly escaping _ before special chars, which
    -            # makes it invalid for Sphinx. A bit weird regex:
    -            # "[:space:][:word:][:underscore:][:comma:]"
    -            r"\s([\w_]+_)[,]",
    -            # DWS dirty-fixes part 2
    -            r"/(\*\+)",
    -        ]
    -        self.rawize_me(soup, rawize_strings)
    -
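# Standalone sketch, not part of the patch: each expression above captures
# one group, which rawize_me() (defined earlier in this file) rewraps as an
# RST literal. The double-backtick wrapping is inferred from the unit tests
# in this patch (e.g. they expect DLI:``*``:``*``:queue:queues.*).
import re

pattern = r"(-\|-)"              # one of the rawize_expressions above
text = "a -|- cell separator"    # made-up input
m = re.search(pattern, text)
if m:
    text = text.replace(m.group(1), f"``{m.group(1)}``")
print(text)  # a ``-|-`` cell separator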
    -        # Pandoc does not seem to properly escape asterisks that
    -        # immediately follow non-word chars
    -        # (https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44)
    -        # NOTE(gtema):
    -        # 1. this is placed here on purpose since we want the special
    -        # escapings above to run first
    -        # 2. we are not escaping asterisks at the end of the paragraphs (pandoc
    -        # deals correctly with that)
    -        re_escape = re.compile(r"([-/'\"<\([{])(\*+)(.)")
    -        for p in soup.body.find_all(string=re_escape):
    -            if p.string and p.parent.name == "p":
    -                p.string.replace_with(
    -                    re.sub(re_escape, r"\1``\2``\3", p.string))
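# Quick standalone check of the substitution above, on a made-up string:
import re

re_escape = re.compile(r"([-/'\"<\([{])(\*+)(.)")
print(re_escape.sub(r"\1``\2``\3", "see (*.log files"))
# -> see (``*``.log files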
    -
    -        # Special case for multiple asterisks and colons like ecs:*:*
    -        re_escape = re.compile(r"([:])(\*+)")
    -        re_escape_new = re.compile(r"([:])(\*)[^$]")
    -        for p in soup.body.find_all(string=re_escape):
    -            if p.string and (p.parent.name == "p" or p.parent.name == "li"):
    -                string = p.string
    -                while re.search(re_escape, string):
    -                    if re.search(re_escape_new, string):
    -                        string = re.sub(
    -                            re_escape, r"\1``\2``", string, count=1)
    -                    else:
    -                        break
    -                p.string.replace_with(string)
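# The loop above, traced standalone on a string from the unit tests in this
# patch: asterisks between colons get literal markup, while the final ".*"
# (no colon in front) is left for pandoc to handle:
import re

re_escape = re.compile(r"([:])(\*+)")
re_escape_new = re.compile(r"([:])(\*)[^$]")
string = "DLI:*:*:queue:queues.*"
while re.search(re_escape, string):
    if re.search(re_escape_new, string):
        string = re.sub(re_escape, r"\1``\2``", string, count=1)
    else:
        break
print(string)  # DLI:``*``:``*``:queue:queues.*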
    -
    -        # Drop parent link at the bottom of the page
    -        for parent in soup.body.find_all("p", class_="familylinks"):
    -            parent.decompose()
    -
    -        return soup.body
    -
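# The unit tests removed in this patch call streamline_html directly, e.g.:
import bs4
from otc_doc_convertor import convertor

c = convertor.OTCDocConvertor()
soup = bs4.BeautifulSoup("<p>DLI:*:*:queue:queues.*</p>", "lxml")
print(c.streamline_html(soup, "dummy"))
# the returned body now reads: DLI:``*``:``*``:queue:queues.*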
    -    def main(self):
    -        logging.basicConfig(level=logging.DEBUG)
    -        parser = argparse.ArgumentParser(description="Process links.")
    -        parser.add_argument("path", type=str, help="path to the files")
    -        parser.add_argument(
    -            "--improve-table-headers",
    -            action="store_true",
    -            help="Improve table headers by enforcing spaces around `/`",
    -        )
    -        parser.add_argument(
    -            "--pygments-lexer", help="Set particular code-block lexer language"
    -        )
    -        parser.add_argument(
    -            "--dest", help="Directory to write resulting files"
    -        )
    -        parser.add_argument("--title", required=True, help="Document title")
    -        parser.add_argument(
    -            "--service", help="Service to which the document belongs"
    -        )
    -        parser.add_argument("--repo-name", help="Service repository")
    -        parser.add_argument("--pdf-name", help="PDF File name")
    -        parser.add_argument(
    -            "--templates-location",
    -            default="templates",
    -            help="Location of additional templates",
    -        )
    -        self.args = parser.parse_args()
    -        if self.args.dest:
    -            dest = pathlib.Path(self.args.dest)
    -        else:
    -            dest = pathlib.Path(self.args.path, "result")
    -        dest.mkdir(parents=True, exist_ok=True)
    -
    -        metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
    -        meta_data = dict()
    -
    -        if not metadata_file.exists():
    -            logging.warning(
    -                f"CLASS.TXT.json file is missing in {self.args.path}, "
    -                f"assuming initial import"
    -            )
    -            with open(pathlib.Path(dest, "index.rst"), "w") as index:
    -                index.write("=" * (len(self.args.title)) + "\n")
    -                index.write(self.args.title + "\n")
    -                index.write("=" * (len(self.args.title)) + "\n")
    -                index.write("\n")
    -        else:
    -            meta_data = json.loads(open(metadata_file).read())
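# Sketch of the index.rst header written by the initial-import branch above,
# for a made-up --title value:
title = "Elastic Cloud Server"   # hypothetical --title
print("\n".join(["=" * len(title), title, "=" * len(title)]))
# ====================
# Elastic Cloud Server
# ====================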
    -        metadata_by_uri = dict()
    -        metadata_by_code = dict()
    -        self.doc_images = set()
    -        for f in meta_data:
    -            f["new_name"] = self.get_new_name(f["title"])
    -            metadata_by_uri[f["uri"]] = f
    -            metadata_by_code[f.get("code")] = f
    -
    -        tree = self.build_doc_tree(metadata_by_code)
    -
    -        pathlib.Path(self.args.path, "temp/").mkdir(
    -            parents=True, exist_ok=True
    -        )
    -
    -        # Scan all docs for anchors
    -        for f in pathlib.Path(self.args.path).glob("*.html"):
    -            if f.name not in metadata_by_uri:
    -                continue
    -            # Registering section links
    -            with open(f, "r") as reader:
    -                logging.debug(f"Scanning {f.name}")
    -                content = reader.read()
    -                soup = bs4.BeautifulSoup(content, "lxml")
    -                for lnk in soup.body.find_all("a"):
    -                    if "name" in lnk.attrs and lnk.string is None:
    -                        anchor = lnk.attrs["name"]
    -                        title = re.sub("[ _:]", "-", anchor)
    -                        res = dict(
    -                            fname=f.name.lower(),
    -                            title=title,
    -                            replace=title.lower()
    -                        )
    -                        self.doc_anchors[anchor] = res
    -                    if "href" in lnk.attrs and lnk["href"]:
    -                        self.doc_links[lnk["href"].lower()] = f.name
    -
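# Standalone example of the anchor-to-title normalization above, with a
# made-up anchor name:
import re

anchor = "en-us_topic_0000:section 1"  # hypothetical <a name="..."> value
print(re.sub("[ _:]", "-", anchor))    # en-us-topic-0000-section-1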
    -        for f in pathlib.Path(self.args.path).glob("*.html"):
    -            if f.name not in metadata_by_uri:
    -                continue
    -            _target = metadata_by_uri[f.name]
    -            target = _target["new_name"]
    -            target_path = self.get_target_path(
    -                _target["p_code"], metadata_by_code
    -            )
    -            pathlib.Path(self.args.path, "temp").mkdir(
    -                parents=True, exist_ok=True
    -            )
    -            pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
    -                parents=True, exist_ok=True
    -            )
    -            pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)
    -
    -            # Pre-processing of html content
    -            with open(f, "r") as reader, open(
    -                pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
    -            ) as writer:
    -                # if f.name not in [
    -                # ]:
    -                #     continue
    -                logging.info(f"Pre-Processing {f} as {target}")
    -                content = reader.read()
    -                # Preprocess - Fix space inside link and not text
    -                # i.e. `<a href="..."> Some link in text</a>`
    - content = re.sub(r"\s", " ", content) - content = re.sub(r"√", "Y", content) - content = re.sub(r"×", "x", content) - content = re.sub(r"π", "Pi", content) - content = re.sub(r"–", "-", content) - content = re.sub(r"≤", "<=", content) - content = re.sub(r"≥", ">=", content) - soup = bs4.BeautifulSoup(content, "lxml") - proc = self.streamline_html(soup, f.name, self.args) - - for lnk in proc.find_all("a"): - href = lnk.get("href") - if href and not href.startswith("http"): - # Internal link - replace with :ref: - code = soup.new_tag("code") - code["class"] = "interpreted-text" - code["role"] = "ref" - href_parts = href.split("#") - if len(href_parts) > 1: - # for anchor just use anchor ref - link_target = href_parts[1].lower() - else: - # for other page - use only page name - link_target = ( - href_parts[0].replace(".html", "").lower() - ) - if link_target: - # Looks like an anchor on the same page - code.string = f"{lnk.string} <{link_target}>" - logging.debug(f" replace {lnk} with {code}") - lnk.replace_with(code) - - logging.info(f"Saving file {writer.name}") - writer.write(str(proc)) - - # Convert html to rst - os.system( - f"pandoc '{self.args.path}/temp/{target}.tmp' -f html " - f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' " - f"--ascii -s --wrap none" - ) - # Post processing of rendered rst - with open( - f"{self.args.path}/tmp_result/" f"{target_path}/{target}.rst", - "r", - ) as reader, open( - pathlib.Path(dest, target_path, f"{target}.rst"), "w" - ) as writer: - logging.info(f"Post processing {target}...") - writer.write(f":original_name: {f.name}\n\n") - # Add root file label - writer.write(f".. _{f.name.replace('.html', '')}:\n\n") - # post process some usual stuff - for line in reader.readlines(): - processed_line = re.sub( - r"\.\.\\\\_(.*):$", r".. _\1:", line) - # replace anchor when it is itself inside some other block - # (i.e. table) - processed_line = re.sub( - r"\.\.\\\\_(.*):\s", r".. _\1: ", processed_line) - # For some reason regex locally and in zuul are not - # behaving same - thus same but different - processed_line = re.sub( - r"\.\.\\_(.*):$", r".. _\1:", processed_line) - # replace anchor when it is itself inside some other block - # (i.e. table) - processed_line = re.sub( - r"\.\.\\_(.*):\s", r".. _\1: ", processed_line) - # We could get unwanted anchors from pandoc - get rid of - # them - anchor = re.search(r"\.\. \_(.*):", processed_line) - if anchor and len(anchor.groups()) > 0: - if not self.is_element_referred( - anchor.group(1), f.name - ): - # This is most likely some duplicated anchor. It is - # not referred from any other place so drop it - logging.info( - "Dropping not referred anchor '%s'", - anchor.group(1)) - continue - - processed_line = re.sub( - r"public_sys-resources/", "", processed_line - ) - processed_line = re.sub( - r" :name: .*$", "", processed_line - ) - processed_line = re.sub( - r"\*\*Parent topic:.*$", "", processed_line - ) - processed_line = re.sub( - r".. code:: screen$", - r".. code-block::", - processed_line, - ) - for lexer in ["json", "bash", "text", "console"]: - processed_line = re.sub( - f".. code:: {lexer}$", - f".. code-block:: {lexer}", - processed_line, - ) - if re.match(rf".. code:: {lexer}\s", processed_line): - logging.error( - f"'code-block: {lexer}' with something " - "afterwards" - ) - exit(1) - # spaces are important, since code-block may reside inside - # of the cell - processed_line = re.sub( - r".. code:: screen\s", - r".. 
code-block:: ", - processed_line, - ) - processed_line = re.sub( - r".. code:: codeblock$", - r".. code-block::", - processed_line, - ) - processed_line = re.sub(r"[ \t]*$", "", processed_line) - writer.write(processed_line) - - # Generate indexes - for k, v in tree.items(): - path = "" - title = self.args.title - page_label = "" - if k != 0: - curr = metadata_by_code[k] - title = curr["title"] - page_label = curr["uri"].replace(".html", "").lower() - path = self.get_target_path(curr["code"], metadata_by_code) - - p = pathlib.Path(dest, f"{path}.rst") - if p.exists(): - logging.warning( - f"{p.resolve()} is renamed into {path}/index.rst" - ) - # Update existing index file - p.rename(pathlib.Path(dest, f"{path}/index.rst")) - with open(pathlib.Path(dest, path, "index.rst"), "a") as index: - index.write("\n") - index.write(".. toctree::\n") - index.write(" :maxdepth: 1\n") - index.write(" :hidden: \n\n") - for child in v: - new_name = child["new_name"] - if child["code"] in tree: - # If this is folder - add /index - new_name = new_name + "/index" - index.write(f" {new_name}\n") - else: - with open(pathlib.Path(dest, path, "index.rst"), "w") as index: - # New index file - if page_label: - index.write(f".. _{page_label}:\n\n") - index.write("=" * (len(title)) + "\n") - index.write(title + "\n") - index.write("=" * (len(title)) + "\n") - index.write("\n") - index.write(".. toctree::\n") - index.write(" :maxdepth: 1\n\n") - for child in v: - new_name = child["new_name"] - if child["code"] in tree: - # If this is folder - add /index - new_name = new_name + "/index" - index.write(f" {new_name}\n") - # Copy used images - if len(self.doc_images) > 0: - logging.debug("Processing images...") - img_dest = pathlib.Path(dest, "_static", "images") - img_dest.mkdir(parents=True, exist_ok=True) - for img in self.doc_images: - shutil.copyfile( - pathlib.Path(self.args.path, img).resolve(strict=False), - pathlib.Path(img_dest, os.path.basename(img)).resolve( - strict=False - ), - ) - - context = dict( - title=self.args.title, - project=self.args.service, - repo_name=self.args.repo_name, - pdf_name=self.args.pdf_name, - ) - loader = FileSystemLoader([self.args.templates_location]) - env = Environment(loader=loader, autoescape=select_autoescape()) - for f in loader.list_templates(): - outfile_tmpl = env.get_template(f) - outfile_rendered = outfile_tmpl.render(**context) - target_file = pathlib.Path(self.args.dest, f) - target_file.parent.mkdir(parents=True, exist_ok=True) - with open(target_file, "w", encoding="utf-8", newline="") as out: - logging.debug(f"Generating {f} from template...") - out.write(outfile_rendered) - - -def main(): - OTCDocConvertor().main() - - -if __name__ == "__main__": - main() diff --git a/otc_doc_convertor/tests/unit/__init__.py b/otc_doc_convertor/tests/unit/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/otc_doc_convertor/tests/unit/test_convertor.py b/otc_doc_convertor/tests/unit/test_convertor.py deleted file mode 100644 index 82b40287..00000000 --- a/otc_doc_convertor/tests/unit/test_convertor.py +++ /dev/null @@ -1,249 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -from unittest import TestCase -import bs4 - -from otc_doc_convertor import convertor - - -class TestConvertor(TestCase): - def setUp(self): - self.convertor = convertor.OTCDocConvertor() - - def test_streamline_html_escape_1(self): - test_data = """ -
    • ObjectCreated:*
    - """ - expected = """ -
    • ObjectCreated:*
    """ - soup = bs4.BeautifulSoup(test_data, 'lxml') - res = self.convertor.streamline_html(soup, "dummy") - self.assertEqual(str(res.find('ul')), expected.strip()) - - def test_streamline_html_escape_2(self): - test_data = """

    s3:*

    s3:*

    If the VPC, subnet, and security group are displayed in the DB - instance list, you need to configure vpc:*:get and - vpc:*:list.

    If the VPC, subnet, and security group are displayed in the DB - instance list, you need to configure vpc:``*``:get and - vpc:``*``:list.

    • You must have VPC-related permissions when creating a n SFS - Turbo instance, including the permissions for verifying VPCs, subnets, - and security groups, creating virtual IP addresses and ports, and - creating security group rules. You must add the following - action:
      • "vpc:*:*"
    • You must have VPC-related permissions when creating a n SFS - Turbo instance, including the permissions for verifying VPCs, subnets, - and security groups, creating virtual IP addresses and ports, and - creating security group rules. You must add the following - action:
      • "vpc:``*``:``*``"

    If the VPC, subnet, and security group are displayed in the DB instance list, you need to configure vpc:*:get and vpc:*:list.

    If the VPC, subnet, and security group are displayed in the DB instance list, you need to configure vpc:``*``:get and vpc:``*``:list.

    Mandatory

    Mandatory

    MandatoryMandatory