From 37b5bf3e95ef0b95c81447129a335bcf5b9af547 Mon Sep 17 00:00:00 2001
From: Artem Goncharov
Date: Wed, 27 Apr 2022 18:05:48 +0200
Subject: [PATCH] add base conversion script (#2)

add base conversion script

Reviewed-by: OpenTelekomCloud Bot
---
 bindep.txt                     |   1 +
 otc_doc_convertor/__init__.py  |   0
 otc_doc_convertor/convertor.py | 426 ++++++++++++++++++++++++++++++++++
 otc_doc_convertor/process.py   | 409 +++++++++++++++++++++++++++++++++
 requirements.txt               |   2 +
 setup.cfg                      |  27 +++
 setup.py                       |  21 ++
 test-requirements.txt          |   1 +
 8 files changed, 887 insertions(+)
 create mode 100644 bindep.txt
 create mode 100644 otc_doc_convertor/__init__.py
 create mode 100644 otc_doc_convertor/convertor.py
 create mode 100644 otc_doc_convertor/process.py
 create mode 100644 requirements.txt
 create mode 100644 setup.cfg
 create mode 100644 setup.py
 create mode 100644 test-requirements.txt

diff --git a/bindep.txt b/bindep.txt
new file mode 100644
index 00000000..4a59b54c
--- /dev/null
+++ b/bindep.txt
@@ -0,0 +1 @@
+pandoc
diff --git a/otc_doc_convertor/__init__.py b/otc_doc_convertor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/otc_doc_convertor/convertor.py b/otc_doc_convertor/convertor.py
new file mode 100644
index 00000000..44be6afe
--- /dev/null
+++ b/otc_doc_convertor/convertor.py
@@ -0,0 +1,426 @@
+#!/usr/bin/env python3
+
+import argparse
+import bs4
+import json
+import logging
+import os
+import pathlib
+import re
+
+
+class OTCDocConvertor:
+
+    def __init__(self):
+        self.doc_anchors = dict()
+        self.doc_links = dict()
+
+    @staticmethod
+    def get_new_name(current_name):
+        new_name = current_name.replace(' - ', '_')
+        new_name = new_name.replace(' ', '_')
+        new_name = new_name.replace('/', '_')
+        new_name = new_name.replace('\'', '')
+        new_name = new_name.replace('"', '')
+        new_name = new_name.replace('`', '')
+        new_name = new_name.replace('´', '')
+        new_name = new_name.replace(':', '')
+        new_name = new_name.replace('?', '')
+        new_name = new_name.replace('(', '')
+        new_name = new_name.replace(')', '')
+        new_name = new_name.lower()
+        return new_name
+
+    @staticmethod
+    def build_doc_tree(metadata):
+        flat_tree = dict()
+        for k, v in metadata.items():
+            parent_id = v.get('p_code')
+            if not parent_id:
+                parent_id = 0
+
+            if parent_id not in flat_tree:
+                flat_tree[parent_id] = list()
+            flat_tree[parent_id].append(v)
+        return flat_tree
+
+    @classmethod
+    def get_target_path(cls, code, metadata, path=''):
+        if code in metadata:
+            current = metadata[code]
+            if not current.get('p_code'):
+                return current['new_name']
+            else:
+                return (
+                    "{0}/{1}".format(
+                        cls.get_target_path(current['p_code'], metadata),
+                        current['new_name'])
+                )
+        else:
+            return ''
+
+    def make_label(self, soup, name):
+        label = soup.new_tag("p")
+        label.string = f"..\\_{name.lower()}:"
+        return label
+
+    def is_element_referred(self, ref, fname):
+        return (
+            ref in self.doc_links
+            or '#' + ref in self.doc_links
+            or fname + '#' + ref in self.doc_links
+        )
+
+    def streamline_html(self, soup, file_name):
+        # Drop eventual header duplicated anchors
+        fname = file_name.replace(".html", "").lower()
+        met_page_anchors = dict()
+        for lnk in soup.body.find_all("a"):
+            name = None
+            if "name" in lnk.attrs and lnk.string is None:
+                name = lnk.attrs["name"].lower()
+                if name in met_page_anchors:
+                    # Such anchor already existed on this page, drop it
+                    lnk.decompose()
+                met_page_anchors[name] = True
+
+            if name and name.lower() == fname:
+                lnk.decompose()
+
+        # Process divs
+        for i in soup.body.find_all('div'):
+            if "note" in i.get('class', []):
+                # Notes
+                del i['id']
+                if i.img:
+                    i.img.decompose()
+                notetitle = i.find('span', class_='notetitle')
+                if notetitle:
+                    title = soup.new_tag('div')
+                    title['class'] = 'title'
+                    title.string = 'Note:'
+                    notetitle.replace_with(title)
+            elif "notice" in i.get('class', []):
+                # Notices
+                del i['id']
+                if i.img:
+                    i.img.decompose()
+                i['class'] = 'important'
+            elif "caution" in i.get('class', []):
+                # Cautions
+                del i['id']
+                if i.img:
+                    i.img.decompose()
+            elif "fignone" in i.get('class', []):
+                # Figures
+                # When we found figure generate local label (anchor)
+                if i.get('id'):
+                    logging.debug('place figure label')
+                    i.insert_before(self.make_label(soup, i.get("id")))
+                figure = soup.new_tag('figure')
+                img = i.find('img')
+                cap = i.find('span', class_='figcap')
+                if cap is not None:
+                    cap.name = 'figcaption'
+                    figure.append(cap)
+                if img:
+                    img['src'] = '/_static/images/' + img['src']
+                    figure.append(img)
+                i.replace_with(figure)
+            elif "section" in i.get('class', []):
+                # Sections
+                # When we found section generate local label (anchor)
+                if i.get('id'):
+                    sec_id = i.get("id").lower()
+                    if self.is_element_referred(sec_id, file_name):
+                        logging.debug('Add section label')
+                        i.insert_before(self.make_label(soup, sec_id))
+                # and still convert to paragraph
+                i.name = 'p'
+            else:
+                i.name = 'p'
+
+        # Drop strong in table headers "/"
+        for th in soup.body.find_all('th'):
+            if th.p.strong:
+                th.p.strong.unwrap()
+
+        if self.args.improve_table_headers:
+            # Add spaces around "/"
+            for th in soup.body.find_all('th'):
+                if hasattr(th, 'p') and th.p.string:
+                    th.p.string = re.sub(
+                        r'\b/\b',
+                        ' / ',
+                        th.p.string)
+
+        # local anchors
+        for lnk in soup.body.find_all("a"):
+            if (
+                lnk.string is None
+                and hasattr(lnk, "name")
+                and not re.match(r"^li\d+$", lnk.attrs["name"])
+                # anywhere section
+                and not re.match(r".*section\d+$", lnk.attrs["name"])
+                # starts with table
+                and not re.match(r"^table\d+$", lnk.attrs["name"])
+            ):
+                # Verify this is really called from somewhere:
+                local_ref = lnk["name"].lower()
+                if self.is_element_referred(local_ref, file_name):
+                    # We now know something in the document wants this anchor -
+                    # replace it with label
+                    lnk.name = "p"
+                    lnk.string = f"..\\_{local_ref}:"
+                    del lnk["name"]
+                else:
+                    logging.debug("Dropping unreferred link")
+
+        for li in soup.body.find_all("li"):
+            del li['id']
+
+        for pre in soup.body.find_all("pre"):
+            text = pre.get_text()
+            # if text.startswith("{"):
+            #     pre["class"] = "data"
+            if re.search(
+                r'\[[a-z]*@\w+.*\][\s#>]?',
+                text
+            ):
+                # Something like "[root@ecs-test-0001 ~]#"
+                pre["class"] = "console"
+            elif re.match(
+                r'^(GET|PUT|POST|DELETE)',
+                text
+            ):
+                # Something like "DELETE https://some_url"
+                pre["class"] = "text"
+
+        # And now specialities
+        rawize_strings = [
+            # "\*\*\*\*\*\*",
+            # r"([\\\/\:\*\?\"\~|<>]{4,})"
+        ]
+        for to_rawize in rawize_strings:
+            for p in soup.body.find_all(string=re.compile(to_rawize)):
+                if p.string:
+                    curr = p.string
+                    part = re.search(to_rawize, curr)
+                    if len(part.groups()) > 0:
+                        new = curr.replace(
+                            part.group(1),
+                            f"{part.group(1)}"
+                        )
+                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
+                        print(part.group(1))
+                        print(f"New content is {p.string}")
+                        logging.error(f"String with star: {p}")
+
+        return soup.body
+
+    def main(self):
+        logging.basicConfig(level=logging.DEBUG)
+        parser = argparse.ArgumentParser(description='Process links.')
+        parser.add_argument(
+            'path', type=str, help='path to the files')
+        parser.add_argument(
+            '--improve-table-headers', action='store_true',
+            help='Improve table headers by enforcing spaces around `/`')
+        parser.add_argument(
+            '--pygments-lexer',
+            help='Set particular code-block lexer language')
+        parser.add_argument(
+            '--dest',
+            help='Directory to write resulting files')
+        self.args = parser.parse_args()
+        retval = os.getcwd()
+        meta_data = json.loads(open(
+            pathlib.Path(self.args.path, "CLASS.TXT.json")
+        ).read())
+        metadata_by_uri = dict()
+        metadata_by_code = dict()
+        if self.args.dest:
+            dest = pathlib.Path(self.args.dest)
+        else:
+            dest = pathlib.Path(self.args.path, 'result')
+
+        for f in meta_data:
+            f['new_name'] = self.get_new_name(f['title'])
+            metadata_by_uri[f['uri']] = f
+            metadata_by_code[f.get('code')] = f
+
+        tree = self.build_doc_tree(metadata_by_code)
+
+        pathlib.Path(self.args.path, "temp/").mkdir(
+            parents=True, exist_ok=True)
+
+        # Scan all docs for anchors
+        for f in pathlib.Path(self.args.path).glob("*.html"):
+            if f.name not in metadata_by_uri:
+                continue
+            # Registering section links
+            with open(f, 'r') as reader:
+                logging.debug(f"Scanning {f.name}")
+                content = reader.read()
+                soup = bs4.BeautifulSoup(content, "lxml")
+                for lnk in soup.body.find_all('a'):
+                    if "name" in lnk.attrs and lnk.string is None:
+                        anchor = lnk.attrs["name"]
+                        title = re.sub('[ _:]', '-', anchor)
+                        res = dict(
+                            fname=f.name,
+                            title=title,
+                            replace=title.lower()
+                        )
+                        self.doc_anchors[anchor] = res
+                    if "href" in lnk.attrs and lnk["href"]:
+                        self.doc_links[lnk["href"].lower()] = f.name
+
+        for f in pathlib.Path(self.args.path).glob("*.html"):
+            if f.name not in metadata_by_uri:
+                continue
+            _target = metadata_by_uri[f.name]
+            target = _target['new_name']
+            target_path = self.get_target_path(
+                _target['p_code'], metadata_by_code)
+            pathlib.Path(self.args.path, "temp").mkdir(
+                parents=True, exist_ok=True)
+            pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
+                parents=True, exist_ok=True)
+            pathlib.Path(dest, target_path).mkdir(
+                parents=True, exist_ok=True)
+
+            # Pre-processing of html content
+            with open(f, 'r') as reader, \
+                    open(pathlib.Path(self.args.path,
+                         f"temp/{target}.tmp"), 'w') as writer:
+                # if f.name not in [
+                #         "modelarts_21_0031.html",
+                #         "en-us_topic_0032380449.html"]:
+                #     continue
+                logging.info(f"Pre-Processing {f} as {target}")
+                content = reader.read()
+                soup = bs4.BeautifulSoup(content, "lxml")
+                proc = self.streamline_html(soup, f.name)
+
+                for lnk in proc.find_all("a"):
+                    href = lnk.get('href')
+                    if href and not href.startswith('http'):
+                        # Internal link - replace with :ref:
+                        code = soup.new_tag('code')
+                        code['class'] = "interpreted-text"
+                        code['role'] = "ref"
+                        href_parts = href.split('#')
+                        if len(href_parts) > 1:
+                            # for anchor just use anchor ref
+                            link_target = href_parts[1].lower()
+                        else:
+                            # for other page - use only page name
+                            link_target = href_parts[0].replace(
+                                ".html", "").lower()
+                        if link_target:
+                            # Looks like an anchor on the same page
+                            code.string = f"{lnk.string} <{link_target}>"
+                            logging.debug(f" replace {lnk} with {code}")
+                            lnk.replace_with(code)
+
+                # Drop parent link at the bottom of the page
+                for parent in proc.find_all("p", class_="parentlink"):
+                    parent.decompose()
+
+                logging.info(f'Saving file {writer.name}')
+                writer.write(str(proc))
+
+            # Convert html to rst
+            os.system(
+                f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
+                f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
+                f"--ascii -s --wrap none"
+            )
+            # Post processing of rendered rst
+            with open(f"{self.args.path}/tmp_result/"
+                      f"{target_path}/{target}.rst", 'r') \
+                    as reader, \
+                    open(pathlib.Path(dest, target_path,
+                                      f"{target}.rst"), 'w') as writer:
+                logging.info(f"Post processing {target}")
+                # writer.write(f":original_name: {f.name}\n\n")
+                # Add root file label
+                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
+                # post process some usual stuff
+                for line in reader.readlines():
+                    processed_line = re.sub(r'\.\.\\_', '.. _', line)
+                    processed_line = re.sub(r'√', 'Y', processed_line)
+                    processed_line = re.sub(
+                        r'public_sys-resources/', '', processed_line)
+                    processed_line = re.sub(
+                        r'image:: ', 'image:: /_static/images/',
+                        processed_line)
+                    processed_line = re.sub(
+                        r' :name: .*$', '', processed_line)
+                    processed_line = re.sub(
+                        r'\*\*Parent topic:.*$', '', processed_line)
+                    processed_line = re.sub(
+                        r'.. code:: screen$',
+                        r'.. code-block::', processed_line)
+                    # for lexer in ["json", "bash", "text", "console"]:
+                    #     processed_line = re.sub(
+                    #         f".. code:: {lexer}$",
+                    #         f".. code-block:: {lexer}", processed_line)
+                    #     if re.match(rf".. code:: {lexer}\s", processed_line):
+                    #         logging.error(
+                    #             f"'code-block: {lexer}' with something "
+                    #             "afterwards")
+                    #         exit(1)
+                    # spaces are important, since code-block may reside inside
+                    # of the cell
+                    processed_line = re.sub(
+                        r'.. code:: screen\s',
+                        r'.. code-block:: ', processed_line)
+                    processed_line = re.sub(
+                        r'.. code:: codeblock$',
+                        r'.. code-block::', processed_line)
+                    writer.write(processed_line)
+
+        # Generate indexes
+        for k, v in tree.items():
+            path = ''
+            title = 'Main Index'
+            page_label = ''
+            if k != 0:
+                curr = metadata_by_code[k]
+                title = curr['title']
+                page_label = curr['uri'].replace(".html", "").lower()
+                path = self.get_target_path(curr['code'], metadata_by_code)
+            with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
+                if page_label:
+                    index.write(f".. _{page_label}:\n\n")
+                index.write('=' * (len(title)) + '\n')
+                index.write(title + '\n')
+                index.write('=' * (len(title)) + '\n')
+                index.write('\n')
+                index.write('.. toctree::\n')
+                index.write('   :maxdepth: 1\n\n')
+                for child in v:
+                    new_name = child['new_name']
+                    if child['code'] in tree:
+                        # If this is folder - add /index
+                        new_name = new_name + '/index'
+                    index.write(f"   {new_name}\n")
+
+            p = pathlib.Path(dest, f"{path}.rst")
+            if p.exists():
+                logging.warning(
+                    f"{p.resolve()} is removed in favour"
+                    f" of result/{path}/index.rst")
+                p.unlink()
+
+        os.chdir(retval)
+
+
+def main():
+    OTCDocConvertor().main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/otc_doc_convertor/process.py b/otc_doc_convertor/process.py
new file mode 100644
index 00000000..d2e5e76d
--- /dev/null
+++ b/otc_doc_convertor/process.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+
+import argparse
+import bs4
+import json
+import logging
+import os
+import pathlib
+import re
+
+
+class OTCDocConvertor:
+
+    def __init__(self):
+        self.doc_anchors = dict()
+        self.doc_links = dict()
+
+    @staticmethod
+    def get_new_name(current_name):
+        new_name = current_name.replace(' - ', '_')
+        new_name = new_name.replace(' ', '_')
+        new_name = new_name.replace('/', '_')
+        new_name = new_name.replace('\'', '')
+        new_name = new_name.replace('"', '')
+        new_name = new_name.replace('`', '')
+        new_name = new_name.replace('´', '')
+        new_name = new_name.replace(':', '')
+        new_name = new_name.replace('?', '')
+        new_name = new_name.replace('(', '')
+        new_name = new_name.replace(')', '')
+        new_name = new_name.lower()
+        return new_name
+
+    @staticmethod
+    def build_doc_tree(metadata):
+        flat_tree = dict()
+        for k, v in metadata.items():
+            parent_id = v.get('p_code')
+            if not parent_id:
+                parent_id = 0
+
+            if parent_id not in flat_tree:
+                flat_tree[parent_id] = list()
+            flat_tree[parent_id].append(v)
+        return flat_tree
+
+    @classmethod
+    def get_target_path(cls, code, metadata, path=''):
+        if code in metadata:
+            current = metadata[code]
+            if not current.get('p_code'):
+                return current['new_name']
+            else:
+                return (
+                    "{0}/{1}".format(
+                        cls.get_target_path(current['p_code'], metadata),
+                        current['new_name'])
+                )
+        else:
+            return ''
+
+    def make_label(self, soup, name):
+        label = soup.new_tag("p")
+        label.string = f"..\\_{name.lower()}:"
+        return label
+
+    def is_element_referred(self, ref, fname):
+        return (
+            ref in self.doc_links
+            or '#' + ref in self.doc_links
+            or fname + '#' + ref in self.doc_links
+        )
+
+    def streamline_html(self, soup, file_name):
+        # Drop eventual header duplicated anchors
+        fname = file_name.replace(".html", "").lower()
+        met_page_anchors = dict()
+        for lnk in soup.body.find_all("a"):
+            name = None
+            if "name" in lnk.attrs and lnk.string is None:
+                name = lnk.attrs["name"].lower()
+                if name in met_page_anchors:
+                    # Such anchor already existed on this page, drop it
+                    lnk.decompose()
+                met_page_anchors[name] = True
+
+            if name and name.lower() == fname:
+                lnk.decompose()
+
+        # Process divs
+        for i in soup.body.find_all('div'):
+            if "note" in i.get('class', []):
+                # Notes
+                del i['id']
+                if i.img:
+                    i.img.decompose()
+                notetitle = i.find('span', class_='notetitle')
+                if notetitle:
+                    title = soup.new_tag('div')
+                    title['class'] = 'title'
+                    title.string = 'Note:'
+                    notetitle.replace_with(title)
+            elif "notice" in i.get('class', []):
+                # Notices
+                del i['id']
+                if i.img:
+                    i.img.decompose()
+                i['class'] = 'important'
+            elif "caution" in i.get('class', []):
+                # Cautions
+                del i['id']
+                if i.img:
+                    i.img.decompose()
+            elif "fignone" in i.get('class', []):
+                # Figures
+                # When we found figure generate local label (anchor)
+                if i.get('id'):
+                    logging.debug('place figure label')
+                    i.insert_before(self.make_label(soup, i.get("id")))
+                figure = soup.new_tag('figure')
+                img = i.find('img')
+                cap = i.find('span', class_='figcap')
+                if cap is not None:
+                    cap.name = 'figcaption'
+                    figure.append(cap)
+                if img:
+                    img['src'] = '/_static/images/' + img['src']
+                    figure.append(img)
+                i.replace_with(figure)
+            elif "section" in i.get('class', []):
+                # Sections
+                # When we found section generate local label (anchor)
+                if i.get('id'):
+                    sec_id = i.get("id").lower()
+                    if self.is_element_referred(sec_id, file_name):
+                        logging.debug('Add section label')
+                        i.insert_before(self.make_label(soup, sec_id))
+                # and still convert to paragraph
+                i.name = 'p'
+            else:
+                i.name = 'p'
+
+        # Drop strong in table headers "/"
+        for th in soup.body.find_all('th'):
+            if th.p.strong:
+                th.p.strong.unwrap()
+
+        if self.args.improve_table_headers:
+            # Add spaces around "/"
+            for th in soup.body.find_all('th'):
+                if hasattr(th, 'p') and th.p.string:
+                    th.p.string = re.sub(
+                        r'\b/\b',
+                        ' / ',
+                        th.p.string)
+
+        # local anchors
+        for lnk in soup.body.find_all("a"):
+            if (
+                lnk.string is None
+                and hasattr(lnk, "name")
+                and not re.match(r"^li\d+$", lnk.attrs["name"])
+                # anywhere section
+                and not re.match(r".*section\d+$", lnk.attrs["name"])
+                # starts with table
+                and not re.match(r"^table\d+$", lnk.attrs["name"])
+            ):
+                # Verify this is really called from somewhere:
+                local_ref = lnk["name"].lower()
+                if self.is_element_referred(local_ref, file_name):
+                    # We now know something in the document wants this anchor -
+                    # replace it with label
+                    lnk.name = "p"
+                    lnk.string = f"..\\_{local_ref}:"
+                    del lnk["name"]
+                else:
+                    logging.debug("Dropping unreferred link")
+
+        for li in soup.body.find_all("li"):
+            del li['id']
+
+        for pre in soup.body.find_all("pre"):
+            text = pre.get_text()
+            # if text.startswith("{"):
+            #     pre["class"] = "data"
+            if re.search(
+                r'\[[a-z]*@\w+.*\][\s#>]?',
+                text
+            ):
+                # Something like "[root@ecs-test-0001 ~]#"
+                pre["class"] = "console"
+            elif re.match(
+                r'^(GET|PUT|POST|DELETE)',
+                text
+            ):
+                # Something like "DELETE https://some_url"
+                pre["class"] = "text"
+
+        # And now specialities
+        rawize_strings = [
+            # "\*\*\*\*\*\*",
+            # r"([\\\/\:\*\?\"\~|<>]{4,})"
+        ]
+        for to_rawize in rawize_strings:
+            for p in soup.body.find_all(string=re.compile(to_rawize)):
+                if p.string:
+                    curr = p.string
+                    part = re.search(to_rawize, curr)
+                    if len(part.groups()) > 0:
+                        new = curr.replace(
+                            part.group(1),
+                            f"{part.group(1)}"
+                        )
+                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
+                        print(part.group(1))
+                        print(f"New content is {p.string}")
+                        logging.error(f"String with star: {p}")
+
+        return soup.body
+
+    def main(self):
+        logging.basicConfig(level=logging.DEBUG)
+        parser = argparse.ArgumentParser(description='Process links.')
+        parser.add_argument(
+            'path', type=str, help='path to the files')
+        parser.add_argument(
+            '--improve-table-headers', action='store_true',
+            help='Improve table headers by enforcing spaces around `/`')
+        parser.add_argument(
+            '--pygments-lexer',
+            help='Set particular code-block lexer language')
+        self.args = parser.parse_args()
+        retval = os.getcwd()
+        os.chdir(self.args.path)
+        meta_data = json.loads(open("CLASS.TXT.json").read())
+        metadata_by_uri = dict()
+        metadata_by_code = dict()
+
+        for f in meta_data:
+            f['new_name'] = self.get_new_name(f['title'])
+            metadata_by_uri[f['uri']] = f
+            metadata_by_code[f.get('code')] = f
+
+        tree = self.build_doc_tree(metadata_by_code)
+
+        pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
+
+        # Scan all docs for anchors
+        for f in pathlib.Path().glob("*.html"):
+            if f.name not in metadata_by_uri:
+                continue
+            # Registering section links
+            with open(f, 'r') as reader:
+                logging.debug(f"Scanning {f.name}")
+                content = reader.read()
+                soup = bs4.BeautifulSoup(content, "lxml")
+                for lnk in soup.body.find_all('a'):
+                    if "name" in lnk.attrs and lnk.string is None:
+                        anchor = lnk.attrs["name"]
+                        title = re.sub('[ _:]', '-', anchor)
+                        res = dict(
+                            fname=f.name,
+                            title=title,
+                            replace=title.lower()
+                        )
+                        self.doc_anchors[anchor] = res
+                    if "href" in lnk.attrs and lnk["href"]:
+                        self.doc_links[lnk["href"].lower()] = f.name
+
+        for f in pathlib.Path().glob("*.html"):
+            if f.name not in metadata_by_uri:
+                continue
+            _target = metadata_by_uri[f.name]
+            target = _target['new_name']
+            target_path = self.get_target_path(
+                _target['p_code'], metadata_by_code)
+            pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
+            pathlib.Path("tmp_result/" + target_path).mkdir(
+                parents=True, exist_ok=True)
+            pathlib.Path("result/" + target_path).mkdir(
+                parents=True, exist_ok=True)
+
+            # Pre-processing of html content
+            with open(f, 'r') as reader, \
+                    open(f"temp/{target}.tmp", 'w') as writer:
+                # if f.name not in [
+                #         "modelarts_21_0031.html",
+                #         "en-us_topic_0032380449.html"]:
+                #     continue
+                logging.info(f"Pre-Processing {f} as {target}")
+                content = reader.read()
+                soup = bs4.BeautifulSoup(content, "lxml")
+                proc = self.streamline_html(soup, f.name)
+
+                for lnk in proc.find_all("a"):
+                    href = lnk.get('href')
+                    if href and not href.startswith('http'):
+                        # Internal link - replace with :ref:
+                        code = soup.new_tag('code')
+                        code['class'] = "interpreted-text"
+                        code['role'] = "ref"
+                        href_parts = href.split('#')
+                        if len(href_parts) > 1:
+                            # for anchor just use anchor ref
+                            link_target = href_parts[1].lower()
+                        else:
+                            # for other page - use only page name
+                            link_target = href_parts[0].replace(
+                                ".html", "").lower()
+                        if link_target:
+                            # Looks like an anchor on the same page
+                            code.string = f"{lnk.string} <{link_target}>"
+                            logging.debug(f" replace {lnk} with {code}")
+                            lnk.replace_with(code)
+
+                # Drop parent link at the bottom of the page
+                for parent in proc.find_all("p", class_="parentlink"):
+                    parent.decompose()
+
+                logging.info(f'Saving file {writer.name}')
+                writer.write(str(proc))
+
+            # Convert html to rst
+            os.system(
+                f"pandoc 'temp/{target}.tmp' -f html "
+                f"-o 'tmp_result/{target_path}/{target}.rst' "
+                f"--ascii -s --wrap none"
+            )
+            # Post processing of rendered rst
+            with open(f"tmp_result/{target_path}/{target}.rst", 'r') \
+                    as reader, \
+                    open(f"result/{target_path}/{target}.rst", 'w') as writer:
+                logging.info(f"Post processing {target}")
+                writer.write(f":original_name: {f.name}\n\n")
+                # Add root file label
+                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
+                # post process some usual stuff
+                for line in reader.readlines():
+                    processed_line = re.sub(r'\.\.\\_', '.. _', line)
+                    processed_line = re.sub(r'√', 'Y', processed_line)
+                    processed_line = re.sub(
+                        r'public_sys-resources/', '', processed_line)
+                    processed_line = re.sub(
+                        r'image:: ', 'image:: /_static/images/',
+                        processed_line)
+                    processed_line = re.sub(
+                        r' :name: .*$', '', processed_line)
+                    processed_line = re.sub(
+                        r'\*\*Parent topic:.*$', '', processed_line)
+                    processed_line = re.sub(
+                        r'.. code:: screen$',
+                        r'.. code-block::', processed_line)
+                    for lexer in ["json", "bash", "text"]:
+                        processed_line = re.sub(
+                            f".. code:: {lexer}$",
+                            f".. code-block:: {lexer}", processed_line)
+                        if re.match(rf".. code:: {lexer}\s", processed_line):
+                            logging.error(
+                                f"'code-block: {lexer}' with something "
+                                "afterwards")
+                            exit(1)
+                    # spaces are important, since code-block may reside inside
+                    # of the cell
+                    processed_line = re.sub(
+                        r'.. code:: screen\s',
+                        r'.. code-block:: ', processed_line)
+                    processed_line = re.sub(
+                        r'.. code:: codeblock$',
+                        r'.. code-block::', processed_line)
+                    writer.write(processed_line)
+
+        # Generate indexes
+        for k, v in tree.items():
+            path = ''
+            title = 'Main Index'
+            page_label = ''
+            if k != 0:
+                curr = metadata_by_code[k]
+                title = curr['title']
+                page_label = curr['uri'].replace(".html", "").lower()
+                path = self.get_target_path(curr['code'], metadata_by_code)
+            with open(f"result/{path}/index.rst", "w") as index:
+                if page_label:
+                    index.write(f".. _{page_label}:\n\n")
+                index.write('=' * (len(title)) + '\n')
+                index.write(title + '\n')
+                index.write('=' * (len(title)) + '\n')
+                index.write('\n')
+                index.write('.. toctree::\n')
+                index.write('   :maxdepth: 1\n\n')
+                for child in v:
+                    new_name = child['new_name']
+                    if child['code'] in tree:
+                        # If this is folder - add /index
+                        new_name = new_name + '/index'
+                    index.write(f"   {new_name}\n")
+
+            p = pathlib.Path(f"result/{path}.rst")
+            if p.exists():
+                logging.warning(
+                    f"{p.resolve()} is removed in favour"
+                    f" of result/{path}/index.rst")
+                p.unlink()
+
+        os.chdir(retval)
+
+
+if __name__ == "__main__":
+    OTCDocConvertor().main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..3b9bcf64
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+bs4
+lxml
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..461c968b
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,27 @@
+[metadata]
+name = otc-doc-convertor
+author = Open Telekom Cloud - Ecosystem Squad
+author_email = dl-pbcotcdeleco@t-systems.com
+description = Python program to convert docs exported in HTML into RST
+description_file =
+    README.md
+home_page = https://github.com/opentelekomcloud-docs/doc-exports
+classifier =
+    License :: OSI Approved :: Apache Software License
+    Operating System :: POSIX :: Linux
+    Operating System :: Unix
+    Operating System :: MacOS
+    Programming Language :: Python
+    Programming Language :: Python :: 2
+    Programming Language :: Python :: 2.7
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.6
+    Programming Language :: Python :: 3.7
+keywords = Sphinx, search, python
+
+[options]
+packages = otc_doc_convertor
+
+[options.entry_points]
+console_scripts =
+    otc-convert-doc = otc_doc_convertor.convertor:main
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..f63cc23c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2013 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# THIS FILE IS MANAGED BY THE GLOBAL REQUIREMENTS REPO - DO NOT EDIT
+import setuptools
+
+setuptools.setup(
+    setup_requires=['pbr>=2.0.0'],
+    pbr=True)
diff --git a/test-requirements.txt b/test-requirements.txt
new file mode 100644
index 00000000..39304807
--- /dev/null
+++ b/test-requirements.txt
@@ -0,0 +1 @@
+flake8