#!/usr/bin/env python3

import argparse
import bs4
import json
import logging
import os
import pathlib
import re
import shutil

from jinja2 import FileSystemLoader, Environment, select_autoescape


class OTCDocConvertor:
    def __init__(self):
        self.doc_anchors = dict()
        self.doc_links = dict()

    @staticmethod
    def get_new_name(current_name):
        new_name = current_name.replace(" - ", "_")
        # This is a unicode char
        new_name = new_name.replace("–", "_")
        new_name = new_name.replace(" ", "_")
        new_name = new_name.replace("/", "_")
        new_name = new_name.replace("=", "_")
        new_name = new_name.replace("+", "")
        new_name = new_name.replace("'", "")
        new_name = new_name.replace('"', "")
        new_name = new_name.replace("`", "")
        new_name = new_name.replace("´", "")
        new_name = new_name.replace(":", "")
        new_name = new_name.replace("?", "")
        new_name = new_name.replace("(", "")
        new_name = new_name.replace(")", "")
        new_name = new_name.replace(",", "")
        new_name = new_name.replace("!", "")
        new_name = new_name.replace("<", "")
        new_name = new_name.replace(">", "")
        new_name = new_name.replace("$", "")
        new_name = new_name.replace("#", "sharp")
        new_name = new_name.replace("%", "pct")
        new_name = new_name.replace('_&_', '_and_')
        new_name = re.sub(r'(\w+)&(\w+)', r'\1_and_\2', new_name)
        new_name = re.sub('(_+)', '_', new_name)
        new_name = new_name.lower()
        return new_name
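
    # Illustrative sketch with a hypothetical title, showing what the
    # sanitization above produces:
    #
    #   >>> OTCDocConvertor.get_new_name("Querying AZs (Native OpenStack API)")
    #   'querying_azs_native_openstack_api'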

    @staticmethod
    def build_doc_tree(metadata):
        flat_tree = dict()
        for k, v in metadata.items():
            parent_id = v.get("p_code")
            if not parent_id:
                parent_id = 0

            if parent_id not in flat_tree:
                flat_tree[parent_id] = list()
            flat_tree[parent_id].append(v)
        return flat_tree
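
    # A minimal sketch (hypothetical metadata records): children are grouped
    # by their parent code, with 0 collecting the root documents:
    #
    #   >>> OTCDocConvertor.build_doc_tree({
    #   ...     "c1": {"code": "c1", "p_code": None, "new_name": "root"},
    #   ...     "c2": {"code": "c2", "p_code": "c1", "new_name": "child"},
    #   ... })
    #   {0: [<record c1>], 'c1': [<record c2>]}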

    @classmethod
    def get_target_path(cls, code, metadata):
        if code in metadata:
            current = metadata[code]
            if not current.get("p_code"):
                return current["new_name"]
            else:
                return "{0}/{1}".format(
                    cls.get_target_path(current["p_code"], metadata),
                    current["new_name"],
                )
        else:
            return ""

    def make_label(self, soup, name):
        label = soup.new_tag("p")
        label.string = f"..\\_{name.lower()}:"
        return label
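
    # The "..\_name:" paragraph is a marker that survives the pandoc run;
    # the rst post-processing in main() later rewrites it into a proper RST
    # label, e.g. for a hypothetical anchor "table01":
    #
    #   <p>..\_table01:</p>  ->  .. _table01: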

    def is_element_referred(self, ref, fname):
        return (
            ref in self.doc_links
            or "#" + ref in self.doc_links
            or fname.lower() + "#" + ref in self.doc_links
        )
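
    # self.doc_links holds the lowercased href targets collected during the
    # scanning pass in main(), so an element counts as referred when another
    # page links to it in any of these forms (hypothetical example):
    # "section1", "#section1" or "faq.html#section1".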

    def rawize_me(self, soup, expressions):
        for to_rawize in expressions:
            for p in soup.body.find_all(string=re.compile(to_rawize)):
                # We should not escape inside of bold - this is wrong
                if p.string and p.parent.name not in [
                        "b", "strong", "pre", "code"]:
                    curr = p.string
                    part = re.search(to_rawize, curr)
                    if len(part.groups()) > 0:
                        logging.debug(
                            "Found element to rawize %s", part.group(1)
                        )
                        new = curr.replace(
                            part.group(1), f"<code>{part.group(1)}</code>"
                        )
                        logging.debug("Replacing string with: %s", new)
                        p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
                        logging.debug("Replacing string with: %s", p.string)
                    else:
                        logging.error(
                            "Cannot find string for rawization anymore"
                        )
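
    # Sketch of the effect on a hypothetical text node: with the expression
    # r"(-\|-)", the text "a -|- b" is re-parsed as "a <code>-|-</code> b",
    # so pandoc later emits the match as an inline literal instead of
    # interpreting it as markup.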

    def streamline_html(self, soup, file_name, args=None):
        # Drop eventual duplicated header anchors
        fname = file_name.replace(".html", "").lower()
        page_anchors = set()
        met_page_anchors = dict()
        for lnk in soup.body.find_all("a"):
            name = None
            if "name" in lnk.attrs and lnk.string is None:
                name = lnk.attrs["name"].lower()
                if name in met_page_anchors:
                    # Such an anchor already existed on this page, drop it
                    lnk.decompose()
                met_page_anchors[name] = True

            if name and name.lower() == fname:
                lnk.decompose()

        # Process divs
        for i in soup.body.find_all("div"):
            if i.decomposed:
                # If we decompose an element later in the code it may still
                # be returned here in the list. Skip those.
                continue
            if "note" in i.get("class", []):
                # Notes
                del i["id"]
                if i.img:
                    i.img.decompose()
                notetitle = i.find("span", class_="notetitle")
                notebody = i.find(class_="notebody")
                if not (notebody and notebody.get_text()):
                    # Some smart people make empty notes. Since those break
                    # the layout we need to drop them.
                    i.decompose()
                elif notetitle:
                    title = soup.new_tag("div")
                    title["class"] = "title"
                    title.string = "Note:"
                    notetitle.replace_with(title)
            elif "warning" in i.get("class", []):
                # Warnings
                del i["id"]
                if i.img:
                    i.img.decompose()
                eltitle = i.find("span", class_="warningtitle")
                if eltitle:
                    title = soup.new_tag("div")
                    title["class"] = "title"
                    title.string = "Warning:"
                    eltitle.replace_with(title)
            elif "notice" in i.get("class", []):
                # Notices
                del i["id"]
                if i.img:
                    i.img.decompose()
                i["class"] = "important"
            elif "caution" in i.get("class", []):
                # Cautions
                del i["id"]
                if i.img:
                    i.img.decompose()
            elif "fignone" in i.get("class", []):
                # Figures
                # When we find a figure, generate a local label (anchor)
                if i.get("id"):
                    logging.debug("place figure label")
                    i.insert_before(self.make_label(soup, i.get("id")))
                figure = soup.new_tag("figure")
                img = i.find("img")
                cap = i.find("span", class_="figcap")
                if cap is not None:
                    cap.name = "figcaption"
                    figure.append(cap)
                if img:
                    # Store all referred images for copying
                    self.doc_images.add(img["src"])
                    img["src"] = (
                        "/_static/images/"
                        + os.path.basename(img["src"])
                    )
                    del img["width"]
                    del img["height"]
                    del img["class"]
                    del img["title"]
                    del img["name"]
                    del img["id"]
                    figure.append(img)
                i.replace_with(figure)
            elif "section" in i.get("class", []):
                # Sections
                if i.get("id"):
                    # When we find a section, generate a local label (anchor)
                    sec_id = i.get("id").lower()
                    if self.is_element_referred(sec_id, file_name):
                        page_anchors.add(sec_id)
                        i.insert_before(self.make_label(soup, sec_id))
                i.unwrap()
            elif i.get("id") and i.get("id").startswith("body"):
                i.unwrap()
            else:
                i.name = "p"

        # Process remaining images
        for img in soup.body.find_all("img"):
            if img["src"] and not img["src"].startswith("/_static/images"):
                self.doc_images.add(img["src"])
                img["src"] = "/_static/images/" + os.path.basename(img["src"])
                del img["width"]
                del img["height"]
                del img["class"]
                del img["title"]
                del img["id"]

        # Drop strong in table headers
        for th in soup.body.find_all("th"):
            if th.p and th.p.strong:
                th.p.strong.unwrap()

        if args and args.improve_table_headers:
            # Add spaces around "/"
            for th in soup.body.find_all("th"):
                # th.p may be None, guard against that
                if th.p and th.p.string:
                    th.p.string = re.sub(r"\b/\b", " / ", th.p.string)

        # Drop strong around links
        for strong in soup.body.find_all("strong"):
            if strong.a:
                strong.unwrap()

        # Table anchors: some tables are referred to, some have an anchor in
        # front of them and some do not. To cope with that we analyze every
        # table and, if it is referred to, prepend an anchor. The anchor
        # processing below will skip it, since such an anchor is then already
        # placed on the page.
        for table in soup.body.find_all("table"):
            # Verify this is really called from somewhere:
            if table.get("id"):
                local_ref = table["id"].lower()
                if self.is_element_referred(local_ref, file_name):
                    # We now know something in the document wants this
                    # anchor - replace it with a label
                    if local_ref not in page_anchors:
                        lnk = bs4.BeautifulSoup(
                            f"<p>..\\_{local_ref}:</p>", "html.parser"
                        )
                        table.insert_before(lnk)
                        page_anchors.add(local_ref)
                    else:
                        logging.debug(
                            "Not placing replaced anchor %s "
                            "since it already existed",
                            local_ref,
                        )

        # local anchors
        for lnk in soup.body.find_all("a"):
            if (
                lnk.string is None
                and lnk.has_attr("name")
                and not re.match(r"^li\d+$", lnk.attrs["name"])
                # anywhere section
                and not re.match(r".*section\d+$", lnk.attrs["name"])
                # starts with table
                and not re.match(r"^table\d+$", lnk.attrs["name"])
            ):
                # Verify this is really called from somewhere:
                local_ref = lnk["name"].lower()
                if self.is_element_referred(local_ref, file_name):
                    # We now know something in the document wants this
                    # anchor - replace it with a label
                    if local_ref not in page_anchors:
                        logging.debug("Adding anchor")
                        lnk.name = "p"
                        lnk.string = f"..\\_{local_ref}:"
                        del lnk["name"]
                        page_anchors.add(local_ref)
                    else:
                        logging.debug(
                            "Not placing replaced anchor %s "
                            "since it already existed",
                            local_ref,
                        )
                else:
                    logging.debug("Dropping unreferred link %s", lnk)

        # The underline element should not be used at all
        for underline in soup.body.find_all("u"):
            underline.unwrap()

        for li in soup.body.find_all("li"):
            if not li.get("id"):
                continue
            local_ref = li.get("id").lower()
            # Delete the li ID to prevent broken RST containers
            del li["id"]

            if (
                self.is_element_referred(local_ref, file_name)
                and local_ref not in page_anchors
            ):
                # LI item is referred, but no anchor is present
                logging.debug("Adding missing li %s anchor", local_ref)
                li.insert(0, self.make_label(soup, local_ref))
                page_anchors.add(local_ref)

        # Sometimes we have code blocks with line numbers:
        # <div class="codecoloring" codetype="xxx"><table class="xxx">
        # <tr><td class="linenos"><div class="linenodiv"><pre>1
        # 2
        # 3</pre></div></td><td class="code"><div class="highlight"><pre>....
        # </pre></div>
        for td_lines in soup.body.find_all("td", "linenos"):
            codeblock = td_lines.parent.find("td", "code")
            table = td_lines.find_parent("table")
            if codeblock and table:
                # Replace the whole table with only the codeblock td
                logging.debug("Replace %s with %s" % (table, codeblock))
                codeblock.name = "pre"
                del codeblock["class"]
                table.replace_with(codeblock)

        for pre in soup.body.find_all("pre"):
            text = pre.get_text()
            # if text.startswith("{"):
            #     pre["class"] = "data"
            if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text):
                # Something like "[root@ecs-test-0001 ~]#"
                pre["class"] = "console"
            elif re.match(r"^(GET|PUT|POST|DELETE)", text):
                # Something like "DELETE https://some_url"
                pre["class"] = "text"
            if "codeblock" in pre.get("class", []):
                # <pre class="codeblock">
                pre["class"] = "text"

        # Another hack: sometimes bold elements end with a space. This is not
        # valid from the rst point of view, so we need to fix that by
        # dropping the trailing space and injecting a space after the strong
        # block.
        for el in soup.body.find_all(["strong", "b"]):
            if (
                el.string
                and el.string[-1] == " "
            ):
                curr = el.string
                el.string.replace_with(curr[:-1])
                el.insert_after(" ")
            elif (
                el.span
                and el.span.string and el.span.string == " "
            ):
                el.span.decompose()

        # A very dirty hack: if we have "*" inside of em (italic), regular
        # escaping does not work anymore (pandoc is not producing proper
        # output). We can only try to put the whole italic content into a
        # literal block to avoid corruption.
        for em in soup.body.find_all("em"):
            if (
                em.string
                and em.string.find("*") != -1
                and em.parent.name not in ["b", "strong"]
            ):
                new = f"<code>{em.string}</code>"
                em.replace_with(bs4.BeautifulSoup(new, "html.parser"))

        # Incredibly dirty hacks:
        rawize_expressions = [
            # DWS Dev Guide hardcodes
            r"^(1\|\"iamtext\"\|\"iamvarchar\"\|2006-07-07\|12:00:00)$",
            r"^(2\|\"iamtext\"\|\"iamvarchar\"\|2022-07-07\|19:00:02)$",
            r"(\*max_files_per_process\*3)",
            r"(&&, &&&, .* <#>)",
            # r"(\*\+)",
            r"(-\|-)",
            r"(^-{8}$)"
        ]
        self.rawize_me(soup, rawize_expressions)

        # Special asterisk treatment
        escape_asterisk_re = r"\((\*)[\.,]"
        self.rawize_me(soup, [escape_asterisk_re])

        # And now the remaining specialities
        rawize_strings = [
            # "\*\*\*\*\*\*",
            # r"([\\\/\:\*\?\"\~|<>]{4,})"
            # ModelArts UMN contains this "unallowed" sequence
            r"(\\/:\*\?\"<>\|)",
            # CSS UMN contains "SELECT exclude('*name') FROM my-index"
            r"(\*name)",
            # DMS UMN contains: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
            r"\(([\W\x60_]{10,})\)",
            # MRS UMN contains: /:*?"<>|\\;&,'`!{}[]$%+
            r"\s([^a-zA-Z0-9\s]{8,})",
            # MRS operation guide contains: /*+ MAPJOIN(join_table) \*/
            # RDS UMN contains: /*FORCE_MASTER*/
            r"(/\*.{5,}\*/)",
            # BMS API contains the sequence in a dedicated paragraph
            r"^([^a-zA-Z0-9\s]{10,})$",
            # OBS special chars - "\$" "\\" etc
            r"^(\\[\$\\bfnrtvu]{1})$",
            # CES contains: urn:smn:([a-z]|[A-Z]|[0-9]|\\-){1,32}:....
            r"\s(urn:smn:\(.*)\.",
            # "-" only (in tables) is considered a list
            r"^(-)$",
            # MRS component guide has: "./mydate_\\\\d*/"
            r"\w(_)\\",
            # Pandoc does not properly escape _ before special chars, which
            # makes it invalid for Sphinx. A bit weird regex:
            # "[:space:][:word:][:underscore:][:comma:]"
            r"\s([\w_]+_)[,]",
            # DWS dirty-fixes part 2
            r"/(\*\+)",
        ]
        self.rawize_me(soup, rawize_strings)

        # Pandoc does not seem to properly escape asterisks that immediately
        # follow non-word characters
        # (https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44)
        # NOTE(gtema):
        # 1. this is placed here on purpose, since we want to apply the
        #    special escapings above first
        # 2. we are not escaping asterisks at the end of paragraphs (pandoc
        #    deals correctly with those)
        re_escape = re.compile(r"([-/'\"<\([{])(\*+)(.)")
        for p in soup.body.find_all(string=re_escape):
            if p.string and p.parent.name == "p":
                p.string.replace_with(
                    re.sub(re_escape, r"\1``\2``\3", p.string))

        # Special case for multiple asterisks and colons like ecs:*:*
        re_escape = re.compile(r"([:])(\*+)")
        re_escape_new = re.compile(r"([:])(\*)[^$]")
        for p in soup.body.find_all(string=re_escape):
            if p.string and (p.parent.name == "p" or p.parent.name == "li"):
                string = p.string
                while re.search(re_escape, string):
                    if re.search(re_escape_new, string):
                        string = re.sub(
                            re_escape, r"\1``\2``", string, count=1)
                    else:
                        break
                p.string.replace_with(string)
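
        # e.g. the loop above turns a hypothetical "ecs:*:*" into
        # "ecs:``*``:*"; the trailing asterisk is left alone since pandoc
        # deals with asterisks at the end of a paragraph itself (see the
        # NOTE above).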

        # Drop the parent link at the bottom of the page
        for parent in soup.body.find_all("p", class_="familylinks"):
            parent.decompose()

        return soup.body

    def main(self):
        logging.basicConfig(level=logging.DEBUG)
        parser = argparse.ArgumentParser(description="Process links.")
        parser.add_argument("path", type=str, help="path to the files")
        parser.add_argument(
            "--improve-table-headers",
            action="store_true",
            help="Improve table headers by enforcing spaces around `/`",
        )
        parser.add_argument(
            "--pygments-lexer", help="Set particular code-block lexer language"
        )
        parser.add_argument(
            "--dest", help="Directory to write resulting files"
        )
        parser.add_argument("--title", required=True, help="Document title")
        parser.add_argument(
            "--service", help="Service to which the document belongs"
        )
        parser.add_argument("--repo-name", help="Service repository")
        parser.add_argument("--pdf-name", help="PDF file name")
        parser.add_argument(
            "--templates-location",
            default="templates",
            help="Location of additional templates",
        )
        self.args = parser.parse_args()
        if self.args.dest:
            dest = pathlib.Path(self.args.dest)
        else:
            dest = pathlib.Path(self.args.path, "result")
        dest.mkdir(parents=True, exist_ok=True)

        metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
        meta_data = dict()

        if not metadata_file.exists():
            logging.warning(
                f"CLASS.TXT.json file is missing in {self.args.path}, "
                f"assuming initial import"
            )
            with open(pathlib.Path(dest, "index.rst"), "w") as index:
                index.write("=" * (len(self.args.title)) + "\n")
                index.write(self.args.title + "\n")
                index.write("=" * (len(self.args.title)) + "\n")
                index.write("\n")
        else:
            meta_data = json.loads(open(metadata_file).read())
        metadata_by_uri = dict()
        metadata_by_code = dict()
        self.doc_images = set()
        for f in meta_data:
            f["new_name"] = self.get_new_name(f["title"])
            metadata_by_uri[f["uri"]] = f
            metadata_by_code[f.get("code")] = f

        tree = self.build_doc_tree(metadata_by_code)

        pathlib.Path(self.args.path, "temp/").mkdir(
            parents=True, exist_ok=True
        )

        # Scan all docs for anchors
        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            # Registering section links
            with open(f, "r") as reader:
                logging.debug(f"Scanning {f.name}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                for lnk in soup.body.find_all("a"):
                    if "name" in lnk.attrs and lnk.string is None:
                        anchor = lnk.attrs["name"]
                        title = re.sub("[ _:]", "-", anchor)
                        res = dict(
                            fname=f.name.lower(),
                            title=title,
                            replace=title.lower()
                        )
                        self.doc_anchors[anchor] = res
                    if "href" in lnk.attrs and lnk["href"]:
                        self.doc_links[lnk["href"].lower()] = f.name

        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            _target = metadata_by_uri[f.name]
            target = _target["new_name"]
            target_path = self.get_target_path(
                _target["p_code"], metadata_by_code
            )
            pathlib.Path(self.args.path, "temp").mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)

            # Pre-processing of the html content
            with open(f, "r") as reader, open(
                pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
            ) as writer:
                # if f.name not in [
                # ]:
                #     continue
                logging.info(f"Pre-Processing {f} as {target}")
                content = reader.read()
                # Preprocess - fix the space placed inside a link instead of
                # after it, i.e. `<p>Some <a>link </a>in text</p>`
                content = re.sub(r"\s</a>", "</a> ", content)
                content = re.sub(r"√", "Y", content)
                content = re.sub(r"×", "x", content)
                content = re.sub(r"π", "Pi", content)
                content = re.sub(r"–", "-", content)
                content = re.sub(r"≤", "<=", content)
                content = re.sub(r"≥", ">=", content)
                soup = bs4.BeautifulSoup(content, "lxml")
                proc = self.streamline_html(soup, f.name, self.args)

                for lnk in proc.find_all("a"):
                    href = lnk.get("href")
                    if href and not href.startswith("http"):
                        # Internal link - replace with :ref:
                        code = soup.new_tag("code")
                        code["class"] = "interpreted-text"
                        code["role"] = "ref"
                        href_parts = href.split("#")
                        if len(href_parts) > 1:
                            # for an anchor just use the anchor ref
                            link_target = href_parts[1].lower()
                        else:
                            # for another page use only the page name
                            link_target = (
                                href_parts[0].replace(".html", "").lower()
                            )
                        if link_target:
                            # Looks like an anchor on the same page
                            code.string = f"{lnk.string} <{link_target}>"
                            logging.debug(f" replace {lnk} with {code}")
                            lnk.replace_with(code)
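
                # e.g. the loop above turns a hypothetical
                # <a href="faq.html#section1">FAQ</a> into an interpreted-text
                # element that pandoc renders as the RST reference
                # :ref:`FAQ <section1>`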

                logging.info(f"Saving file {writer.name}")
                writer.write(str(proc))

            # Convert html to rst
            os.system(
                f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
                f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
                f"--ascii -s --wrap none"
            )
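
            # With hypothetical paths this runs roughly:
            #   pandoc 'docs/temp/overview.tmp' -f html \
            #     -o 'docs/tmp_result/overview.rst' --ascii -s --wrap none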

            # Post-processing of the rendered rst
            with open(
                f"{self.args.path}/tmp_result/" f"{target_path}/{target}.rst",
                "r",
            ) as reader, open(
                pathlib.Path(dest, target_path, f"{target}.rst"), "w"
            ) as writer:
                logging.info(f"Post processing {target}...")
                writer.write(f":original_name: {f.name}\n\n")
                # Add the root file label
                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
                # post-process some of the usual stuff
                for line in reader.readlines():
                    processed_line = re.sub(
                        r"\.\.\\\\_(.*):$", r".. _\1:", line)
                    # replace the anchor when it is itself inside some other
                    # block (i.e. a table)
                    processed_line = re.sub(
                        r"\.\.\\\\_(.*):\s", r".. _\1: ", processed_line)
                    # For some reason the regex does not behave the same
                    # locally and in zuul - thus the same substitution again,
                    # slightly different
                    processed_line = re.sub(
                        r"\.\.\\_(.*):$", r".. _\1:", processed_line)
                    # replace the anchor when it is itself inside some other
                    # block (i.e. a table)
                    processed_line = re.sub(
                        r"\.\.\\_(.*):\s", r".. _\1: ", processed_line)
                    # We could get unwanted anchors from pandoc - get rid of
                    # them
                    anchor = re.search(r"\.\. \_(.*):", processed_line)
                    if anchor and len(anchor.groups()) > 0:
                        if not self.is_element_referred(
                            anchor.group(1), f.name
                        ):
                            # This is most likely some duplicated anchor. It
                            # is not referred from any other place, so drop it
                            logging.info(
                                "Dropping not referred anchor '%s'",
                                anchor.group(1))
                            continue

                    processed_line = re.sub(
                        r"public_sys-resources/", "", processed_line
                    )
                    processed_line = re.sub(
                        r" :name: .*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r"\*\*Parent topic:.*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r".. code:: screen$",
                        r".. code-block::",
                        processed_line,
                    )
                    for lexer in ["json", "bash", "text", "console"]:
                        processed_line = re.sub(
                            f".. code:: {lexer}$",
                            f".. code-block:: {lexer}",
                            processed_line,
                        )
                        if re.match(rf".. code:: {lexer}\s", processed_line):
                            logging.error(
                                f"'code-block: {lexer}' with something "
                                "afterwards"
                            )
                            exit(1)
                    # spaces are important, since the code-block may reside
                    # inside of a cell
                    processed_line = re.sub(
                        r".. code:: screen\s",
                        r".. code-block:: ",
                        processed_line,
                    )
                    processed_line = re.sub(
                        r".. code:: codeblock$",
                        r".. code-block::",
                        processed_line,
                    )
                    processed_line = re.sub(r"[ \t]*$", "", processed_line)
                    writer.write(processed_line)

        # Generate indexes
        for k, v in tree.items():
            path = ""
            title = self.args.title
            page_label = ""
            if k != 0:
                curr = metadata_by_code[k]
                title = curr["title"]
                page_label = curr["uri"].replace(".html", "").lower()
                path = self.get_target_path(curr["code"], metadata_by_code)

            p = pathlib.Path(dest, f"{path}.rst")
            if p.exists():
                logging.warning(
                    f"{p.resolve()} is renamed into {path}/index.rst"
                )
                # Update the existing index file
                p.rename(pathlib.Path(dest, f"{path}/index.rst"))
                with open(pathlib.Path(dest, path, "index.rst"), "a") as index:
                    index.write("\n")
                    index.write(".. toctree::\n")
                    index.write("   :maxdepth: 1\n")
                    index.write("   :hidden:\n\n")
                    for child in v:
                        new_name = child["new_name"]
                        if child["code"] in tree:
                            # If this is a folder - add /index
                            new_name = new_name + "/index"
                        index.write(f"   {new_name}\n")
            else:
                with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
                    # New index file
                    if page_label:
                        index.write(f".. _{page_label}:\n\n")
                    index.write("=" * (len(title)) + "\n")
                    index.write(title + "\n")
                    index.write("=" * (len(title)) + "\n")
                    index.write("\n")
                    index.write(".. toctree::\n")
                    index.write("   :maxdepth: 1\n\n")
                    for child in v:
                        new_name = child["new_name"]
                        if child["code"] in tree:
                            # If this is a folder - add /index
                            new_name = new_name + "/index"
                        index.write(f"   {new_name}\n")

        # Copy the used images
        if len(self.doc_images) > 0:
            logging.debug("Processing images...")
            img_dest = pathlib.Path(dest, "_static", "images")
            img_dest.mkdir(parents=True, exist_ok=True)
            for img in self.doc_images:
                shutil.copyfile(
                    pathlib.Path(self.args.path, img).resolve(strict=False),
                    pathlib.Path(img_dest, os.path.basename(img)).resolve(
                        strict=False
                    ),
                )

        context = dict(
            title=self.args.title,
            project=self.args.service,
            repo_name=self.args.repo_name,
            pdf_name=self.args.pdf_name,
        )
        loader = FileSystemLoader([self.args.templates_location])
        env = Environment(loader=loader, autoescape=select_autoescape())
        for f in loader.list_templates():
            outfile_tmpl = env.get_template(f)
            outfile_rendered = outfile_tmpl.render(**context)
            # use `dest` here (not self.args.dest, which may be unset) so
            # templates land next to the converted files
            target_file = pathlib.Path(dest, f)
            target_file.parent.mkdir(parents=True, exist_ok=True)
            with open(target_file, "w", encoding="utf-8", newline="") as out:
                logging.debug(f"Generating {f} from template...")
                out.write(outfile_rendered)


def main():
    OTCDocConvertor().main()


if __name__ == "__main__":
    main()