diff --git a/bindep.txt b/bindep.txt
new file mode 100644
index 00000000..4a59b54c
--- /dev/null
+++ b/bindep.txt
@@ -0,0 +1 @@
+pandoc
diff --git a/otc_doc_convertor/__init__.py b/otc_doc_convertor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/otc_doc_convertor/convertor.py b/otc_doc_convertor/convertor.py
new file mode 100644
index 00000000..44be6afe
--- /dev/null
+++ b/otc_doc_convertor/convertor.py
@@ -0,0 +1,426 @@
+#!/usr/bin/env python3
+
+import argparse
+import bs4
+import json
+import logging
+import os
+import pathlib
+import re
+
+
+class OTCDocConvertor:
+
+ def __init__(self):
+ self.doc_anchors = dict()
+ self.doc_links = dict()
+
+ @staticmethod
+ def get_new_name(current_name):
+ new_name = current_name.replace(' - ', '_')
+ new_name = new_name.replace(' ', '_')
+ new_name = new_name.replace('/', '_')
+ new_name = new_name.replace('\'', '')
+ new_name = new_name.replace('"', '')
+ new_name = new_name.replace('`', '')
+ new_name = new_name.replace('´', '')
+ new_name = new_name.replace(':', '')
+ new_name = new_name.replace('?', '')
+ new_name = new_name.replace('(', '')
+ new_name = new_name.replace(')', '')
+ new_name = new_name.lower()
+ return new_name
+
+ @staticmethod
+ def build_doc_tree(metadata):
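+        # Group metadata records by parent code: records without a
+        # p_code become children of the synthetic root 0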
+ flat_tree = dict()
+ for k, v in metadata.items():
+ parent_id = v.get('p_code')
+ if not parent_id:
+ parent_id = 0
+
+ if parent_id not in flat_tree:
+ flat_tree[parent_id] = list()
+ flat_tree[parent_id].append(v)
+ return flat_tree
+
+ @classmethod
+    def get_target_path(cls, code, metadata):
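+        # Recursively walk up the p_code chain to build the relative
+        # output path for a document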
+ if code in metadata:
+ current = metadata[code]
+ if not current.get('p_code'):
+ return current['new_name']
+ else:
+ return (
+ "{0}/{1}".format(
+ cls.get_target_path(current['p_code'], metadata),
+ current['new_name'])
+ )
+ else:
+ return ''
+
+ def make_label(self, soup, name):
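+        # Create a placeholder paragraph "..\_name:" which the RST
+        # post-processing step rewrites into a proper ".. _name:" label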
+ label = soup.new_tag("p")
+ label.string = f"..\\_{name.lower()}:"
+ return label
+
+ def is_element_referred(self, ref, fname):
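+        # An element is referred to if any collected link targets it
+        # directly, as "#ref" or as "fname#ref"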
+ return (
+ ref in self.doc_links
+ or '#' + ref in self.doc_links
+ or fname + '#' + ref in self.doc_links
+ )
+
+ def streamline_html(self, soup, file_name):
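+        # Normalize the exported HTML so pandoc can emit clean RST:
+        # rewrite note/figure/section divs, convert referenced anchors
+        # into labels and drop auto-generated noise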
+        # Drop duplicated anchors and anchors repeating the page's own name
+ fname = file_name.replace(".html", "").lower()
+ met_page_anchors = dict()
+ for lnk in soup.body.find_all("a"):
+ name = None
+ if "name" in lnk.attrs and lnk.string is None:
+ name = lnk.attrs["name"].lower()
+ if name in met_page_anchors:
+                    # Such an anchor already exists on this page; drop it
+ lnk.decompose()
+ met_page_anchors[name] = True
+
+ if name and name.lower() == fname:
+ lnk.decompose()
+
+ # Process divs
+ for i in soup.body.find_all('div'):
+ if "note" in i.get('class', []):
+ # Notes
+ del i['id']
+ if i.img:
+ i.img.decompose()
+ notetitle = i.find('span', class_='notetitle')
+ if notetitle:
+ title = soup.new_tag('div')
+ title['class'] = 'title'
+ title.string = 'Note:'
+ notetitle.replace_with(title)
+ elif "notice" in i.get('class', []):
+ # Notices
+ del i['id']
+ if i.img:
+ i.img.decompose()
+ i['class'] = 'important'
+ elif "caution" in i.get('class', []):
+ # Cautions
+ del i['id']
+ if i.img:
+ i.img.decompose()
+ elif "fignone" in i.get('class', []):
+ # Figures
+                # When a figure has an id, generate a local label (anchor)
+ if i.get('id'):
+ logging.debug('place figure label')
+ i.insert_before(self.make_label(soup, i.get("id")))
+ figure = soup.new_tag('figure')
+ img = i.find('img')
+ cap = i.find('span', class_='figcap')
+ if cap is not None:
+ cap.name = 'figcaption'
+ figure.append(cap)
+ if img:
+ img['src'] = '/_static/images/' + img['src']
+ figure.append(img)
+ i.replace_with(figure)
+ elif "section" in i.get('class', []):
+ # Sections
+                # When a section has an id, generate a local label (anchor)
+ if i.get('id'):
+ sec_id = i.get("id").lower()
+ if self.is_element_referred(sec_id, file_name):
+ logging.debug('Add section label')
+ i.insert_before(self.make_label(soup, sec_id))
+ # and still convert to paragraph
+ i.name = 'p'
+ else:
+ i.name = 'p'
+
+        # Drop <strong> markup in table headers
+        for th in soup.body.find_all('th'):
+            if th.p and th.p.strong:
+                th.p.strong.unwrap()
+
+ if self.args.improve_table_headers:
+ # Add spaces around "/"
+ for th in soup.body.find_all('th'):
+                if th.p and th.p.string:
+ th.p.string = re.sub(
+ r'\b/\b',
+ ' / ',
+ th.p.string)
+
+ # local anchors
+ for lnk in soup.body.find_all("a"):
+            if (
+                lnk.string is None
+                and "name" in lnk.attrs
+                # skip auto-generated list item anchors ("li<N>")
+                and not re.match(r"^li\d+$", lnk.attrs["name"])
+                # skip anchors ending in "section<N>"
+                and not re.match(r".*section\d+$", lnk.attrs["name"])
+                # skip auto-generated table anchors ("table<N>")
+                and not re.match(r"^table\d+$", lnk.attrs["name"])
+            ):
+ # Verify this is really called from somewhere:
+ local_ref = lnk["name"].lower()
+ if self.is_element_referred(local_ref, file_name):
+ # We now know something in the document wants this anchor -
+ # replace it with label
+ lnk.name = "p"
+ lnk.string = f"..\\_{local_ref}:"
+ del lnk["name"]
+ else:
+                    logging.debug("Dropping unreferenced anchor")
+
+ for li in soup.body.find_all("li"):
+ del li['id']
+
+ for pre in soup.body.find_all("pre"):
+ text = pre.get_text()
+ # if text.startswith("{"):
+ # pre["class"] = "data"
+ if re.search(
+ r'\[[a-z]*@\w+.*\][\s#>]?',
+ text
+ ):
+ # Something like "[root@ecs-test-0001 ~]#"
+ pre["class"] = "console"
+ elif re.match(
+ r'^(GET|PUT|POST|DELETE)',
+ text
+ ):
+ # Something like "DELETE https://some_url"
+ pre["class"] = "text"
+
+        # And now the special cases
+ rawize_strings = [
+ # "\*\*\*\*\*\*",
+ # r"([\\\/\:\*\?\"\~|<>]{4,})"
+ ]
+ for to_rawize in rawize_strings:
+ for p in soup.body.find_all(string=re.compile(to_rawize)):
+ if p.string:
+ curr = p.string
+ part = re.search(to_rawize, curr)
+ if len(part.groups()) > 0:
+                        # break the matched sequence onto its own line
+                        new = curr.replace(
+                            part.group(1),
+                            f"{part.group(1)}\n"
+                        )
+                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
+                        logging.debug(f"Rawized {part.group(1)}")
+                        logging.debug(f"New content is {p.string}")
+
+ return soup.body
+
+ def main(self):
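+        # Overall flow: scan all pages for anchors and links, streamline
+        # the HTML, convert it with pandoc, post-process the resulting
+        # RST and generate index files from the document tree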
+ logging.basicConfig(level=logging.DEBUG)
+        parser = argparse.ArgumentParser(
+            description='Convert HTML doc exports to RST.')
+ parser.add_argument(
+ 'path', type=str, help='path to the files')
+ parser.add_argument(
+ '--improve-table-headers', action='store_true',
+ help='Improve table headers by enforcing spaces around `/`')
+ parser.add_argument(
+ '--pygments-lexer',
+ help='Set particular code-block lexer language')
+ parser.add_argument(
+ '--dest',
+ help='Directory to write resulting files')
+ self.args = parser.parse_args()
+ retval = os.getcwd()
+        meta_data = json.loads(pathlib.Path(
+            self.args.path, "CLASS.TXT.json").read_text())
+ metadata_by_uri = dict()
+ metadata_by_code = dict()
+ if self.args.dest:
+ dest = pathlib.Path(self.args.dest)
+ else:
+ dest = pathlib.Path(self.args.path, 'result')
+
+ for f in meta_data:
+ f['new_name'] = self.get_new_name(f['title'])
+ metadata_by_uri[f['uri']] = f
+ metadata_by_code[f.get('code')] = f
+
+ tree = self.build_doc_tree(metadata_by_code)
+
+ pathlib.Path(self.args.path, "temp/").mkdir(
+ parents=True, exist_ok=True)
+
+ # Scan all docs for anchors
+ for f in pathlib.Path(self.args.path).glob("*.html"):
+ if f.name not in metadata_by_uri:
+ continue
+ # Registering section links
+ with open(f, 'r') as reader:
+ logging.debug(f"Scanning {f.name}")
+ content = reader.read()
+ soup = bs4.BeautifulSoup(content, "lxml")
+ for lnk in soup.body.find_all('a'):
+ if "name" in lnk.attrs and lnk.string is None:
+ anchor = lnk.attrs["name"]
+ title = re.sub('[ _:]', '-', anchor)
+ res = dict(
+ fname=f.name,
+ title=title,
+ replace=title.lower()
+ )
+ self.doc_anchors[anchor] = res
+ if "href" in lnk.attrs and lnk["href"]:
+ self.doc_links[lnk["href"].lower()] = f.name
+
+ for f in pathlib.Path(self.args.path).glob("*.html"):
+ if f.name not in metadata_by_uri:
+ continue
+ _target = metadata_by_uri[f.name]
+ target = _target['new_name']
+ target_path = self.get_target_path(
+ _target['p_code'], metadata_by_code)
+ pathlib.Path(self.args.path, "temp").mkdir(
+ parents=True, exist_ok=True)
+ pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
+ parents=True, exist_ok=True)
+ pathlib.Path(dest, target_path).mkdir(
+ parents=True, exist_ok=True)
+
+ # Pre-processing of html content
+ with open(f, 'r') as reader, \
+ open(pathlib.Path(self.args.path,
+ f"temp/{target}.tmp"), 'w') as writer:
+ # if f.name not in [
+ # "modelarts_21_0031.html",
+ # "en-us_topic_0032380449.html"]:
+ # continue
+ logging.info(f"Pre-Processing {f} as {target}")
+ content = reader.read()
+ soup = bs4.BeautifulSoup(content, "lxml")
+ proc = self.streamline_html(soup, f.name)
+
+ for lnk in proc.find_all("a"):
+ href = lnk.get('href')
+ if href and not href.startswith('http'):
+                        # Internal link: replace with an interpreted-text
+                        # element that pandoc renders as :ref:
+ code = soup.new_tag('code')
+ code['class'] = "interpreted-text"
+ code['role'] = "ref"
+ href_parts = href.split('#')
+ if len(href_parts) > 1:
+ # for anchor just use anchor ref
+ link_target = href_parts[1].lower()
+ else:
+ # for other page - use only page name
+ link_target = href_parts[0].replace(
+ ".html", "").lower()
+ if link_target:
+                            # Render the link text with its :ref: target
+ code.string = f"{lnk.string} <{link_target}>"
+ logging.debug(f" replace {lnk} with {code}")
+ lnk.replace_with(code)
+
+ # Drop parent link at the bottom of the page
+ for parent in proc.find_all("p", class_="parentlink"):
+ parent.decompose()
+
+ logging.info(f'Saving file {writer.name}')
+ writer.write(str(proc))
+
+ # Convert html to rst
+ os.system(
+ f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
+ f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
+ f"--ascii -s --wrap none"
+ )
+ # Post processing of rendered rst
+ with open(f"{self.args.path}/tmp_result/"
+ f"{target_path}/{target}.rst", 'r') \
+ as reader, \
+ open(pathlib.Path(dest, target_path,
+ f"{target}.rst"), 'w') as writer:
+ logging.info(f"Post processing {target}")
+ # writer.write(f":original_name: {f.name}\n\n")
+ # Add root file label
+ writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
+ # post process some usual stuff
+ for line in reader.readlines():
+ processed_line = re.sub(r'\.\.\\_', '.. _', line)
+ processed_line = re.sub(r'√', 'Y', processed_line)
+ processed_line = re.sub(
+ r'public_sys-resources/', '', processed_line)
+ processed_line = re.sub(
+ r'image:: ', 'image:: /_static/images/',
+ processed_line)
+ processed_line = re.sub(
+ r' :name: .*$', '', processed_line)
+ processed_line = re.sub(
+ r'\*\*Parent topic:.*$', '', processed_line)
+ processed_line = re.sub(
+                        r'\.\. code:: screen$',
+ r'.. code-block::', processed_line)
+ # for lexer in ["json", "bash", "text", "console"]:
+ # processed_line = re.sub(
+ # f".. code:: {lexer}$",
+ # f".. code-block:: {lexer}", processed_line)
+ # if re.match(rf".. code:: {lexer}\s", processed_line):
+ # logging.error(
+ # f"'code-block: {lexer}' with something "
+ # "afterwards")
+ # exit(1)
+                    # the trailing space matters: the code-block may sit
+                    # inside a table cell
+                    processed_line = re.sub(
+                        r'\.\. code:: screen\s',
+                        r'.. code-block:: ', processed_line)
+ processed_line = re.sub(
+                        r'\.\. code:: codeblock$',
+ r'.. code-block::', processed_line)
+ writer.write(processed_line)
+
+ # Generate indexes
+ for k, v in tree.items():
+ path = ''
+ title = 'Main Index'
+ page_label = ''
+ if k != 0:
+ curr = metadata_by_code[k]
+ title = curr['title']
+ page_label = curr['uri'].replace(".html", "").lower()
+ path = self.get_target_path(curr['code'], metadata_by_code)
+ with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
+ if page_label:
+ index.write(f".. _{page_label}:\n\n")
+ index.write('=' * (len(title)) + '\n')
+ index.write(title + '\n')
+ index.write('=' * (len(title)) + '\n')
+ index.write('\n')
+ index.write('.. toctree::\n')
+ index.write(' :maxdepth: 1\n\n')
+ for child in v:
+ new_name = child['new_name']
+ if child['code'] in tree:
+                        # If this is a folder, add /index
+ new_name = new_name + '/index'
+ index.write(f" {new_name}\n")
+
+ p = pathlib.Path(dest, f"{path}.rst")
+ if p.exists():
+                logging.warning(
+                    f"{p.resolve()} is removed in favour"
+                    f" of {dest}/{path}/index.rst")
+ p.unlink()
+
+ os.chdir(retval)
+
+
+def main():
+ OTCDocConvertor().main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/otc_doc_convertor/process.py b/otc_doc_convertor/process.py
new file mode 100644
index 00000000..d2e5e76d
--- /dev/null
+++ b/otc_doc_convertor/process.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+
+import argparse
+import bs4
+import json
+import logging
+import os
+import pathlib
+import re
+
+
+class OTCDocConvertor:
+
+ def __init__(self):
+ self.doc_anchors = dict()
+ self.doc_links = dict()
+
+ @staticmethod
+ def get_new_name(current_name):
+ new_name = current_name.replace(' - ', '_')
+ new_name = new_name.replace(' ', '_')
+ new_name = new_name.replace('/', '_')
+ new_name = new_name.replace('\'', '')
+ new_name = new_name.replace('"', '')
+ new_name = new_name.replace('`', '')
+ new_name = new_name.replace('´', '')
+ new_name = new_name.replace(':', '')
+ new_name = new_name.replace('?', '')
+ new_name = new_name.replace('(', '')
+ new_name = new_name.replace(')', '')
+ new_name = new_name.lower()
+ return new_name
+
+ @staticmethod
+ def build_doc_tree(metadata):
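+        # Group metadata records by parent code: records without a
+        # p_code become children of the synthetic root 0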
+ flat_tree = dict()
+ for k, v in metadata.items():
+ parent_id = v.get('p_code')
+ if not parent_id:
+ parent_id = 0
+
+ if parent_id not in flat_tree:
+ flat_tree[parent_id] = list()
+ flat_tree[parent_id].append(v)
+ return flat_tree
+
+ @classmethod
+    def get_target_path(cls, code, metadata):
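+        # Recursively walk up the p_code chain to build the relative
+        # output path for a document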
+ if code in metadata:
+ current = metadata[code]
+ if not current.get('p_code'):
+ return current['new_name']
+ else:
+ return (
+ "{0}/{1}".format(
+ cls.get_target_path(current['p_code'], metadata),
+ current['new_name'])
+ )
+ else:
+ return ''
+
+ def make_label(self, soup, name):
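+        # Create a placeholder paragraph "..\_name:" which the RST
+        # post-processing step rewrites into a proper ".. _name:" label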
+ label = soup.new_tag("p")
+ label.string = f"..\\_{name.lower()}:"
+ return label
+
+ def is_element_referred(self, ref, fname):
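+        # An element is referred to if any collected link targets it
+        # directly, as "#ref" or as "fname#ref"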
+ return (
+ ref in self.doc_links
+ or '#' + ref in self.doc_links
+ or fname + '#' + ref in self.doc_links
+ )
+
+ def streamline_html(self, soup, file_name):
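+        # Normalize the exported HTML so pandoc can emit clean RST:
+        # rewrite note/figure/section divs, convert referenced anchors
+        # into labels and drop auto-generated noise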
+        # Drop duplicated anchors and anchors repeating the page's own name
+ fname = file_name.replace(".html", "").lower()
+ met_page_anchors = dict()
+ for lnk in soup.body.find_all("a"):
+ name = None
+ if "name" in lnk.attrs and lnk.string is None:
+ name = lnk.attrs["name"].lower()
+ if name in met_page_anchors:
+                    # Such an anchor already exists on this page; drop it
+ lnk.decompose()
+ met_page_anchors[name] = True
+
+ if name and name.lower() == fname:
+ lnk.decompose()
+
+ # Process divs
+ for i in soup.body.find_all('div'):
+ if "note" in i.get('class', []):
+ # Notes
+ del i['id']
+ if i.img:
+ i.img.decompose()
+ notetitle = i.find('span', class_='notetitle')
+ if notetitle:
+ title = soup.new_tag('div')
+ title['class'] = 'title'
+ title.string = 'Note:'
+ notetitle.replace_with(title)
+ elif "notice" in i.get('class', []):
+ # Notices
+ del i['id']
+ if i.img:
+ i.img.decompose()
+ i['class'] = 'important'
+ elif "caution" in i.get('class', []):
+ # Cautions
+ del i['id']
+ if i.img:
+ i.img.decompose()
+ elif "fignone" in i.get('class', []):
+ # Figures
+                # When a figure has an id, generate a local label (anchor)
+ if i.get('id'):
+ logging.debug('place figure label')
+ i.insert_before(self.make_label(soup, i.get("id")))
+ figure = soup.new_tag('figure')
+ img = i.find('img')
+ cap = i.find('span', class_='figcap')
+ if cap is not None:
+ cap.name = 'figcaption'
+ figure.append(cap)
+ if img:
+ img['src'] = '/_static/images/' + img['src']
+ figure.append(img)
+ i.replace_with(figure)
+ elif "section" in i.get('class', []):
+ # Sections
+                # When a section has an id, generate a local label (anchor)
+ if i.get('id'):
+ sec_id = i.get("id").lower()
+ if self.is_element_referred(sec_id, file_name):
+ logging.debug('Add section label')
+ i.insert_before(self.make_label(soup, sec_id))
+ # and still convert to paragraph
+ i.name = 'p'
+ else:
+ i.name = 'p'
+
+        # Drop <strong> markup in table headers
+        for th in soup.body.find_all('th'):
+            if th.p and th.p.strong:
+                th.p.strong.unwrap()
+
+ if self.args.improve_table_headers:
+ # Add spaces around "/"
+ for th in soup.body.find_all('th'):
+                if th.p and th.p.string:
+ th.p.string = re.sub(
+ r'\b/\b',
+ ' / ',
+ th.p.string)
+
+ # local anchors
+ for lnk in soup.body.find_all("a"):
+            if (
+                lnk.string is None
+                and "name" in lnk.attrs
+                # skip auto-generated list item anchors ("li<N>")
+                and not re.match(r"^li\d+$", lnk.attrs["name"])
+                # skip anchors ending in "section<N>"
+                and not re.match(r".*section\d+$", lnk.attrs["name"])
+                # skip auto-generated table anchors ("table<N>")
+                and not re.match(r"^table\d+$", lnk.attrs["name"])
+            ):
+ # Verify this is really called from somewhere:
+ local_ref = lnk["name"].lower()
+ if self.is_element_referred(local_ref, file_name):
+ # We now know something in the document wants this anchor -
+ # replace it with label
+ lnk.name = "p"
+ lnk.string = f"..\\_{local_ref}:"
+ del lnk["name"]
+ else:
+                    logging.debug("Dropping unreferenced anchor")
+
+ for li in soup.body.find_all("li"):
+ del li['id']
+
+ for pre in soup.body.find_all("pre"):
+ text = pre.get_text()
+ # if text.startswith("{"):
+ # pre["class"] = "data"
+ if re.search(
+ r'\[[a-z]*@\w+.*\][\s#>]?',
+ text
+ ):
+ # Something like "[root@ecs-test-0001 ~]#"
+ pre["class"] = "console"
+ elif re.match(
+ r'^(GET|PUT|POST|DELETE)',
+ text
+ ):
+ # Something like "DELETE https://some_url"
+ pre["class"] = "text"
+
+        # And now the special cases
+ rawize_strings = [
+ # "\*\*\*\*\*\*",
+ # r"([\\\/\:\*\?\"\~|<>]{4,})"
+ ]
+ for to_rawize in rawize_strings:
+ for p in soup.body.find_all(string=re.compile(to_rawize)):
+ if p.string:
+ curr = p.string
+ part = re.search(to_rawize, curr)
+ if len(part.groups()) > 0:
+                        # break the matched sequence onto its own line
+                        new = curr.replace(
+                            part.group(1),
+                            f"{part.group(1)}\n"
+                        )
+                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
+                        logging.debug(f"Rawized {part.group(1)}")
+                        logging.debug(f"New content is {p.string}")
+
+ return soup.body
+
+ def main(self):
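+        # Overall flow: scan all pages for anchors and links, streamline
+        # the HTML, convert it with pandoc, post-process the resulting
+        # RST and generate index files from the document tree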
+ logging.basicConfig(level=logging.DEBUG)
+        parser = argparse.ArgumentParser(
+            description='Convert HTML doc exports to RST.')
+ parser.add_argument(
+ 'path', type=str, help='path to the files')
+ parser.add_argument(
+ '--improve-table-headers', action='store_true',
+ help='Improve table headers by enforcing spaces around `/`')
+ parser.add_argument(
+ '--pygments-lexer',
+ help='Set particular code-block lexer language')
+ self.args = parser.parse_args()
+ retval = os.getcwd()
+ os.chdir(self.args.path)
+        meta_data = json.loads(pathlib.Path("CLASS.TXT.json").read_text())
+ metadata_by_uri = dict()
+ metadata_by_code = dict()
+
+ for f in meta_data:
+ f['new_name'] = self.get_new_name(f['title'])
+ metadata_by_uri[f['uri']] = f
+ metadata_by_code[f.get('code')] = f
+
+ tree = self.build_doc_tree(metadata_by_code)
+
+ pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
+
+ # Scan all docs for anchors
+ for f in pathlib.Path().glob("*.html"):
+ if f.name not in metadata_by_uri:
+ continue
+ # Registering section links
+ with open(f, 'r') as reader:
+ logging.debug(f"Scanning {f.name}")
+ content = reader.read()
+ soup = bs4.BeautifulSoup(content, "lxml")
+ for lnk in soup.body.find_all('a'):
+ if "name" in lnk.attrs and lnk.string is None:
+ anchor = lnk.attrs["name"]
+ title = re.sub('[ _:]', '-', anchor)
+ res = dict(
+ fname=f.name,
+ title=title,
+ replace=title.lower()
+ )
+ self.doc_anchors[anchor] = res
+ if "href" in lnk.attrs and lnk["href"]:
+ self.doc_links[lnk["href"].lower()] = f.name
+
+ for f in pathlib.Path().glob("*.html"):
+ if f.name not in metadata_by_uri:
+ continue
+ _target = metadata_by_uri[f.name]
+ target = _target['new_name']
+ target_path = self.get_target_path(
+ _target['p_code'], metadata_by_code)
+ pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
+ pathlib.Path("tmp_result/" + target_path).mkdir(
+ parents=True, exist_ok=True)
+ pathlib.Path("result/" + target_path).mkdir(
+ parents=True, exist_ok=True)
+
+ # Pre-processing of html content
+ with open(f, 'r') as reader, \
+ open(f"temp/{target}.tmp", 'w') as writer:
+ # if f.name not in [
+ # "modelarts_21_0031.html",
+ # "en-us_topic_0032380449.html"]:
+ # continue
+ logging.info(f"Pre-Processing {f} as {target}")
+ content = reader.read()
+ soup = bs4.BeautifulSoup(content, "lxml")
+ proc = self.streamline_html(soup, f.name)
+
+ for lnk in proc.find_all("a"):
+ href = lnk.get('href')
+ if href and not href.startswith('http'):
+                        # Internal link: replace with an interpreted-text
+                        # element that pandoc renders as :ref:
+ code = soup.new_tag('code')
+ code['class'] = "interpreted-text"
+ code['role'] = "ref"
+ href_parts = href.split('#')
+ if len(href_parts) > 1:
+ # for anchor just use anchor ref
+ link_target = href_parts[1].lower()
+ else:
+ # for other page - use only page name
+ link_target = href_parts[0].replace(
+ ".html", "").lower()
+ if link_target:
+                            # Render the link text with its :ref: target
+ code.string = f"{lnk.string} <{link_target}>"
+ logging.debug(f" replace {lnk} with {code}")
+ lnk.replace_with(code)
+
+ # Drop parent link at the bottom of the page
+ for parent in proc.find_all("p", class_="parentlink"):
+ parent.decompose()
+
+ logging.info(f'Saving file {writer.name}')
+ writer.write(str(proc))
+
+ # Convert html to rst
+ os.system(
+ f"pandoc 'temp/{target}.tmp' -f html "
+ f"-o 'tmp_result/{target_path}/{target}.rst' "
+ f"--ascii -s --wrap none"
+ )
+ # Post processing of rendered rst
+ with open(f"tmp_result/{target_path}/{target}.rst", 'r') \
+ as reader, \
+ open(f"result/{target_path}/{target}.rst", 'w') as writer:
+ logging.info(f"Post processing {target}")
+ writer.write(f":original_name: {f.name}\n\n")
+ # Add root file label
+ writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
+ # post process some usual stuff
+ for line in reader.readlines():
+ processed_line = re.sub(r'\.\.\\_', '.. _', line)
+ processed_line = re.sub(r'√', 'Y', processed_line)
+ processed_line = re.sub(
+ r'public_sys-resources/', '', processed_line)
+ processed_line = re.sub(
+ r'image:: ', 'image:: /_static/images/',
+ processed_line)
+ processed_line = re.sub(
+ r' :name: .*$', '', processed_line)
+ processed_line = re.sub(
+ r'\*\*Parent topic:.*$', '', processed_line)
+ processed_line = re.sub(
+                        r'\.\. code:: screen$',
+ r'.. code-block::', processed_line)
+ for lexer in ["json", "bash", "text"]:
+                        processed_line = re.sub(
+                            rf"\.\. code:: {lexer}$",
+                            f".. code-block:: {lexer}", processed_line)
+                        if re.match(rf"\.\. code:: {lexer}\s", processed_line):
+ logging.error(
+ f"'code-block: {lexer}' with something "
+ "afterwards")
+ exit(1)
+                    # the trailing space matters: the code-block may sit
+                    # inside a table cell
+                    processed_line = re.sub(
+                        r'\.\. code:: screen\s',
+                        r'.. code-block:: ', processed_line)
+ processed_line = re.sub(
+                        r'\.\. code:: codeblock$',
+ r'.. code-block::', processed_line)
+ writer.write(processed_line)
+
+ # Generate indexes
+ for k, v in tree.items():
+ path = ''
+ title = 'Main Index'
+ page_label = ''
+ if k != 0:
+ curr = metadata_by_code[k]
+ title = curr['title']
+ page_label = curr['uri'].replace(".html", "").lower()
+ path = self.get_target_path(curr['code'], metadata_by_code)
+ with open(f"result/{path}/index.rst", "w") as index:
+ if page_label:
+ index.write(f".. _{page_label}:\n\n")
+ index.write('=' * (len(title)) + '\n')
+ index.write(title + '\n')
+ index.write('=' * (len(title)) + '\n')
+ index.write('\n')
+ index.write('.. toctree::\n')
+ index.write(' :maxdepth: 1\n\n')
+ for child in v:
+ new_name = child['new_name']
+ if child['code'] in tree:
+                        # If this is a folder, add /index
+ new_name = new_name + '/index'
+ index.write(f" {new_name}\n")
+
+ p = pathlib.Path(f"result/{path}.rst")
+ if p.exists():
+ logging.warning(
+ f"{p.resolve()} is removed in favour"
+ f" of result/{path}/index.rst")
+ p.unlink()
+
+ os.chdir(retval)
+
+
+if __name__ == "__main__":
+ OTCDocConvertor().main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..3b9bcf64
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4
+lxml
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..461c968b
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,27 @@
+[metadata]
+name = otc-doc-convertor
+author = Open Telekom Cloud - Ecosystem Squad
+author_email = dl-pbcotcdeleco@t-systems.com
+description = Python program to convert docs exported as HTML into RST
+description_file =
+ README.md
+home_page = https://github.com/opentelekomcloud-docs/doc-exports
+classifier =
+ License :: OSI Approved :: Apache Software License
+ Operating System :: POSIX :: Linux
+ Operating System :: Unix
+ Operating System :: MacOS
+ Programming Language :: Python
+ Programming Language :: Python :: 3
+ Programming Language :: Python :: 3.6
+ Programming Language :: Python :: 3.7
+keywords = Sphinx, search, python
+
+[options]
+packages = otc_doc_convertor
+
+[options.entry_points]
+console_scripts =
+ otc-convert-doc = otc_doc_convertor.convertor:main
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..f63cc23c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2013 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# THIS FILE IS MANAGED BY THE GLOBAL REQUIREMENTS REPO - DO NOT EDIT
+import setuptools
+
+setuptools.setup(
+ setup_requires=['pbr>=2.0.0'],
+ pbr=True)
diff --git a/test-requirements.txt b/test-requirements.txt
new file mode 100644
index 00000000..39304807
--- /dev/null
+++ b/test-requirements.txt
@@ -0,0 +1 @@
+flake8