#!/usr/bin/env python3
import argparse
import bs4
import json
import logging
import os
import pathlib
import re
import shutil
from jinja2 import FileSystemLoader, Environment, select_autoescape
class OTCDocConvertor:
def __init__(self):
self.doc_anchors = dict()
self.doc_links = dict()
@staticmethod
def get_new_name(current_name):
new_name = current_name.replace(" - ", "_")
# This is a unicode char
new_name = new_name.replace("", "_")
new_name = new_name.replace(" ", "_")
new_name = new_name.replace("/", "_")
new_name = new_name.replace("=", "_")
new_name = new_name.replace("+", "")
new_name = new_name.replace("'", "")
new_name = new_name.replace('"', "")
new_name = new_name.replace("`", "")
new_name = new_name.replace("´", "")
new_name = new_name.replace(":", "")
new_name = new_name.replace("?", "")
new_name = new_name.replace("(", "")
new_name = new_name.replace(")", "")
new_name = new_name.replace(",", "")
new_name = new_name.replace("!", "")
new_name = new_name.replace("<", "")
new_name = new_name.replace(">", "")
new_name = new_name.replace("$", "")
new_name = new_name.replace("#", "sharp")
new_name = new_name.replace("%", "pct")
new_name = new_name.replace('_&_', '_and_')
new_name = re.sub(r'(\w+)&(\w+)', r'\1_and_\2', new_name)
new_name = re.sub('(_+)', '_', new_name)
new_name = new_name.lower()
return new_name
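    # get_new_name example (hypothetical title): "Creating an ECS / Basic
    # Operations (v2)" comes out as "creating_an_ecs_basic_operations_v2"
    # (spaces, slashes and parentheses replaced, repeated "_" collapsed).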
@staticmethod
def build_doc_tree(metadata):
flat_tree = dict()
for k, v in metadata.items():
parent_id = v.get("p_code")
if not parent_id:
parent_id = 0
if parent_id not in flat_tree:
flat_tree[parent_id] = list()
flat_tree[parent_id].append(v)
return flat_tree
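    # build_doc_tree example (hypothetical metadata): entries
    # {"c1": {"code": "c1", "p_code": None}, "c2": {"code": "c2", "p_code": "c1"}}
    # are grouped as {0: [<c1 entry>], "c1": [<c2 entry>]}.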
@classmethod
def get_target_path(cls, code, metadata):
if code in metadata:
current = metadata[code]
if not current.get("p_code"):
return current["new_name"]
else:
return "{0}/{1}".format(
cls.get_target_path(current["p_code"], metadata),
current["new_name"],
)
else:
return ""
def make_label(self, soup, name):
label = soup.new_tag("p")
label.string = f"..\\_{name.lower()}:"
return label
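    # make_label example: make_label(soup, "Section_One") produces
    # <p>..\_section_one:</p>, which the rst post-processing in main()
    # later rewrites into the label ".. _section_one:".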
def is_element_referred(self, ref, fname):
return (
ref in self.doc_links
or "#" + ref in self.doc_links
or fname.lower() + "#" + ref in self.doc_links
)
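    # is_element_referred example: if any page links to
    # "some_page.html#section1" (a hypothetical href), doc_links contains
    # that key and is_element_referred("section1", "some_page.html")
    # returns True.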
def rawize_me(self, soup, expressions):
for to_rawize in expressions:
for p in soup.body.find_all(string=re.compile(to_rawize)):
if p.string and p.parent.name not in [
"b", "strong", "pre", "code"]:
curr = p.string
part = re.search(to_rawize, curr)
# We should not escape inside of bold - this is wrong
if len(part.groups()) > 0:
logging.debug(
"Found element to rawize %s", part.group(1)
)
new = curr.replace(
part.group(1), f"<code>{part.group(1)}</code>"
)
logging.debug("Replacing string with: %s", new)
p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
logging.debug("Replacing string with: %s", p.string)
else:
logging.error(
"Cannot find string for rawization anymore"
)
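    # rawize_me example: with the expression r"(-\|-)" a paragraph like
    # "use -|- as separator" becomes "use <code>-|-</code> as separator",
    # so the matched group survives the pandoc conversion as a literal.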
def streamline_html(self, soup, file_name, args=None):
        # Drop possibly duplicated header anchors
fname = file_name.replace(".html", "").lower()
page_anchors = set()
met_page_anchors = dict()
for lnk in soup.body.find_all("a"):
name = None
if "name" in lnk.attrs and lnk.string is None:
name = lnk.attrs["name"].lower()
if name in met_page_anchors:
# Such anchor already existed on this page, drop it
lnk.decompose()
met_page_anchors[name] = True
if name and name.lower() == fname:
lnk.decompose()
# Process divs
for i in soup.body.find_all("div"):
if i.decomposed:
                # if we decompose an element later in the code it may still
                # be returned here in the list. Skip those
continue
if "note" in i.get("class", []):
# Notes
del i["id"]
if i.img:
i.img.decompose()
notetitle = i.find("span", class_="notetitle")
notebody = i.find(class_="notebody")
if not (notebody and notebody.get_text()):
                    # Some smart people make empty notes. Since these break
                    # the layout we need to drop them
i.decompose()
elif notetitle:
title = soup.new_tag("div")
title["class"] = "title"
title.string = "Note:"
notetitle.replace_with(title)
elif "warning" in i.get("class", []):
# Warnings
del i["id"]
if i.img:
i.img.decompose()
eltitle = i.find("span", class_="warningtitle")
if eltitle:
title = soup.new_tag("div")
title["class"] = "title"
title.string = "Warning:"
eltitle.replace_with(title)
elif "notice" in i.get("class", []):
# Notices
del i["id"]
if i.img:
i.img.decompose()
i["class"] = "important"
elif "caution" in i.get("class", []):
# Cautions
del i["id"]
if i.img:
i.img.decompose()
elif "fignone" in i.get("class", []):
# Figures
                # When we find a figure, generate a local label (anchor)
if i.get("id"):
logging.debug("place figure label")
i.insert_before(self.make_label(soup, i.get("id")))
figure = soup.new_tag("figure")
img = i.find("img")
cap = i.find("span", class_="figcap")
if cap is not None:
cap.name = "figcaption"
figure.append(cap)
if img:
# Store all referred images for copying
self.doc_images.add(img["src"])
img["src"] = (
"/_static/images/"
+ os.path.basename(img["src"])
)
del img["width"]
del img["height"]
del img["class"]
del img["title"]
del img["name"]
del img["id"]
figure.append(img)
i.replace_with(figure)
elif "section" in i.get("class", []):
# Sections
if i.get("id"):
                    # When we find a section, generate a local label (anchor)
sec_id = i.get("id").lower()
if self.is_element_referred(sec_id, file_name):
page_anchors.add(sec_id)
i.insert_before(self.make_label(soup, sec_id))
i.unwrap()
elif i.get("id") and i.get("id").startswith("body"):
i.unwrap()
else:
i.name = "p"
# Process remaining images
for img in soup.body.find_all("img"):
if img["src"] and not img["src"].startswith("/_static/images"):
self.doc_images.add(img["src"])
img["src"] = "/_static/images/" + os.path.basename(img["src"])
del img["width"]
del img["height"]
del img["class"]
del img["title"]
del img["id"]
        # Drop strong in table headers
for th in soup.body.find_all("th"):
if th.p and th.p.strong:
th.p.strong.unwrap()
if args and args.improve_table_headers:
# Add spaces around "/"
for th in soup.body.find_all("th"):
                if th.p and th.p.string:
th.p.string = re.sub(r"\b/\b", " / ", th.p.string)
        # Drop strong around links
for strong in soup.body.find_all("strong"):
if strong.a:
strong.unwrap()
        # Table anchors: some tables are referred to. Some have an anchor in
        # front, some do not. To cope with that we analyze every table and,
        # if it is referred to, prepend an anchor. The following anchor
        # processing will skip it, since such an anchor is already placed on
        # the page
for table in soup.body.find_all("table"):
# Verify this is really called from somewhere:
if table.get("id"):
local_ref = table["id"].lower()
if self.is_element_referred(local_ref, file_name):
# We now know something in the document wants this anchor -
# replace it with label
if local_ref not in page_anchors:
lnk = bs4.BeautifulSoup(
f"<p>..\\_{local_ref}:</p>", "html.parser"
)
table.insert_before(lnk)
page_anchors.add(local_ref)
else:
logging.debug(
"Not placing replaced anchor %s "
"since it already existed",
local_ref,
)
# local anchors
for lnk in soup.body.find_all("a"):
if (
lnk.string is None
and lnk.has_attr("name")
and not re.match(r"^li\d+$", lnk.attrs["name"])
# anywhere section
and not re.match(r".*section\d+$", lnk.attrs["name"])
# starts with table
and not re.match(r"^table\d+$", lnk.attrs["name"])
):
# Verify this is really called from somewhere:
local_ref = lnk["name"].lower()
if self.is_element_referred(local_ref, file_name):
# We now know something in the document wants this anchor -
# replace it with label
if local_ref not in page_anchors:
logging.debug("Adding anchor")
lnk.name = "p"
lnk.string = f"..\\_{local_ref}:"
del lnk["name"]
page_anchors.add(local_ref)
else:
logging.debug(
"Not placing replaced anchor %s "
" since it already existed",
local_ref,
)
else:
logging.debug("Dropping unreferred link %s", lnk)
        # Underline elements should not be used at all
for underline in soup.body.find_all("u"):
underline.unwrap()
for li in soup.body.find_all("li"):
if not li.get("id"):
continue
local_ref = li.get("id").lower()
# Delete li ID to prevent broken RST containers
del li["id"]
if (
self.is_element_referred(local_ref, file_name)
and local_ref not in page_anchors
):
# LI item referred, but no anchor present
logging.debug("Adding missing li %s anchor", local_ref)
li.insert(0, self.make_label(soup, local_ref))
page_anchors.add(local_ref)
# Sometimes we have code blocks with line numbers.
# <div class="codecoloring" codetype="xxx"><table class="xxx">
# <tr><td class="linenos"><div class="linenodiv"><pre>1
# 2
# 3</pre></div></td><td class="code"><div class="highlight"><pre>....
# </pre></div>
for td_lines in soup.body.find_all("td", "linenos"):
codeblock = td_lines.parent.find("td", "code")
table = td_lines.find_parent("table")
if codeblock and table:
# Replace whole table with only codeblock td
logging.debug("Replace %s with %s" % (table, codeblock))
codeblock.name = "pre"
del codeblock["class"]
table.replace_with(codeblock)
for pre in soup.body.find_all("pre"):
text = pre.get_text()
# if text.startswith("{"):
# pre["class"] = "data"
if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text):
# Something like "[root@ecs-test-0001 ~]#"
pre["class"] = "console"
elif re.match(r"^(GET|PUT|POST|DELETE)", text):
# Something like "DELETE https://some_url"
pre["class"] = "text"
if "codeblock" in pre.get("class", []):
# <pre class="codeblock"
pre["class"] = "text"
        # Another hack: sometimes bold elements end with a space. This is not
        # valid from the rst point of view, so we fix it by dropping the
        # space and injecting a space after the strong block
for el in soup.body.find_all(["strong", "b"]):
if (
el.string
and el.string[-1] == " "
):
curr = el.string
el.string.replace_with(curr[:-1])
el.insert_after(" ")
elif (
el.span
and el.span.string and el.span.string == " "
):
el.span.decompose()
        # A very dirty hack: if we have "*" inside of em (italic), regular
        # escaping does not work anymore (pandoc does not produce proper
        # output). We can only try to put the whole italic content into a
        # literal block to avoid corruption.
for em in soup.body.find_all("em"):
if (
em.string
and em.string.find("*") != -1
and em.parent.name not in ["b", "strong"]
):
new = f"<code>{em.string}</code>"
em.replace_with(bs4.BeautifulSoup(new, "html.parser"))
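        # Example: <em>select *</em> is replaced with <code>select *</code>
        # so the asterisk is emitted as a literal instead of rst markup.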
# Incredibly dirty hacks:
rawize_expressions = [
            # DWS Dev Guide hardcodes
r"^(1\|\"iamtext\"\|\"iamvarchar\"\|2006-07-07\|12:00:00)$",
r"^(2\|\"iamtext\"\|\"iamvarchar\"\|2022-07-07\|19:00:02)$",
r"(\*max_files_per_process\*3)",
r"(&&, &&&, .* <#>)",
# r"(\*\+)",
r"(-\|-)",
r"(^-{8}$)"
]
self.rawize_me(soup, rawize_expressions)
        # Special asterisk treatment
escape_asterisk_re = r"\((\*)[\.,]"
self.rawize_me(soup, [escape_asterisk_re])
# And now remaining specialities
rawize_strings = [
# "\*\*\*\*\*\*",
# r"([\\\/\:\*\?\"\~|<>]{4,})"
            # ModelArts UMN contains this "unallowed" sequence
r"(\\/:\*\?\"<>\|)",
            # CSS UMN contains "SELECT exclude('*name') FROM my-index"
r"(\*name)",
            # DMS UMN contains: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
r"\(([\W\x60_]{10,})\)",
            # MRS UMN contains: /:*?"<>|\\;&,'`!{}[]$%+
r"\s([^a-zA-Z0-9\s]{8,})",
            # MRS operation guide contains: /*+ MAPJOIN(join_table) \*/
            # RDS UMN contains: /*FORCE_MASTER*/
r"(/\*.{5,}\*/)",
            # BMS API contains a sequence in a dedicated paragraph
r"^([^a-zA-Z0-9\s]{10,})$",
# OBS special chars - "\$" "\\" etc
r"^(\\[\$\\bfnrtvu]{1})$",
# CES contains: urn:smn:([a-z]|[A-Z]|[0-9]|\\-){1,32}:....
r"\s(urn:smn:\(.*)\.",
# "-" only (in tables) is considered as list
r"^(-)$",
# MRS component guide has: "./mydate_\\\\d*/"
r"\w(_)\\",
            # Pandoc is not properly escaping _ before special chars, which
            # makes it invalid for Sphinx. A bit weird regex:
# "[:space:][:word:][:underscore:][:comma:]"
r"\s([\w_]+_)[,]",
# DWS dirty-fixes part 2
r"/(\*\+)",
]
self.rawize_me(soup, rawize_strings)
        # Pandoc does not seem to properly escape asterisks that
        # immediately follow non-word chars
# (https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44)
# NOTE(gtema):
        # 1. this is placed here on purpose since we want to have some
        # special escaping applied above
# 2. we are not escaping asterisks at the end of the paragraphs (pandoc
# deals correctly with that)
re_escape = re.compile(r"([-/'\"<\([{])(\*+)(.)")
for p in soup.body.find_all(string=re_escape):
if p.string and p.parent.name == "p":
p.string.replace_with(
re.sub(re_escape, r"\1``\2``\3", p.string))
# Special case for multiple asterisks and colons like ecs:*:*
re_escape = re.compile(r"([:])(\*+)")
re_escape_new = re.compile(r"([:])(\*)[^$]")
for p in soup.body.find_all(string=re_escape):
if p.string and (p.parent.name == "p" or p.parent.name == "li"):
string = p.string
while re.search(re_escape, string):
if re.search(re_escape_new, string):
string = re.sub(
re_escape, r"\1``\2``", string, count=1)
else:
break
p.string.replace_with(string)
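        # Example: a paragraph whose text is "ecs:*:*" ends up as
        # "ecs:``*``:*" - the trailing asterisk is left alone since pandoc
        # handles asterisks at the end of a paragraph correctly.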
# Drop parent link at the bottom of the page
for parent in soup.body.find_all("p", class_="familylinks"):
parent.decompose()
return soup.body
def main(self):
logging.basicConfig(level=logging.DEBUG)
parser = argparse.ArgumentParser(description="Process links.")
parser.add_argument("path", type=str, help="path to the files")
parser.add_argument(
"--improve-table-headers",
action="store_true",
help="Improve table headers by enforcing spaces around `/`",
)
parser.add_argument(
"--pygments-lexer", help="Set particular code-block lexer language"
)
parser.add_argument(
"--dest", help="Directory to write resulting files"
)
parser.add_argument("--title", required=True, help="Document title")
parser.add_argument(
"--service", help="Service to which the document belongs to"
)
parser.add_argument("--repo-name", help="Service repository")
parser.add_argument("--pdf-name", help="PDF File name")
parser.add_argument(
"--templates-location",
default="templates",
help="Location of additional templates",
)
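        # Example invocation (script name, paths and values are
        # hypothetical):
        #   python3 convertor.py ./docs-html --title "Elastic Cloud Server" \
        #       --service ecs --dest ./result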
self.args = parser.parse_args()
if self.args.dest:
dest = pathlib.Path(self.args.dest)
else:
dest = pathlib.Path(self.args.path, "result")
dest.mkdir(parents=True, exist_ok=True)
metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
meta_data = dict()
if not metadata_file.exists():
logging.warning(
f"CLASS.TXT.json file is missing in {self.args.path}, "
f"assuming initial import"
)
with open(pathlib.Path(dest, "index.rst"), "w") as index:
index.write("=" * (len(self.args.title)) + "\n")
index.write(self.args.title + "\n")
index.write("=" * (len(self.args.title)) + "\n")
index.write("\n")
else:
meta_data = json.loads(open(metadata_file).read())
metadata_by_uri = dict()
metadata_by_code = dict()
self.doc_images = set()
for f in meta_data:
f["new_name"] = self.get_new_name(f["title"])
metadata_by_uri[f["uri"]] = f
metadata_by_code[f.get("code")] = f
tree = self.build_doc_tree(metadata_by_code)
pathlib.Path(self.args.path, "temp/").mkdir(
parents=True, exist_ok=True
)
# Scan all docs for anchors
for f in pathlib.Path(self.args.path).glob("*.html"):
if f.name not in metadata_by_uri:
continue
# Registering section links
with open(f, "r") as reader:
logging.debug(f"Scanning {f.name}")
content = reader.read()
soup = bs4.BeautifulSoup(content, "lxml")
for lnk in soup.body.find_all("a"):
if "name" in lnk.attrs and lnk.string is None:
anchor = lnk.attrs["name"]
title = re.sub("[ _:]", "-", anchor)
res = dict(
fname=f.name.lower(),
title=title,
replace=title.lower()
)
self.doc_anchors[anchor] = res
if "href" in lnk.attrs and lnk["href"]:
self.doc_links[lnk["href"].lower()] = f.name
for f in pathlib.Path(self.args.path).glob("*.html"):
if f.name not in metadata_by_uri:
continue
_target = metadata_by_uri[f.name]
target = _target["new_name"]
target_path = self.get_target_path(
_target["p_code"], metadata_by_code
)
pathlib.Path(self.args.path, "temp").mkdir(
parents=True, exist_ok=True
)
pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
parents=True, exist_ok=True
)
pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)
# Pre-processing of html content
with open(f, "r") as reader, open(
pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
) as writer:
# if f.name not in [
# ]:
# continue
logging.info(f"Pre-Processing {f} as {target}")
content = reader.read()
                # Preprocess - fix trailing space inside the link rather than
                # in the text, i.e. `<p>Some <a>link </a>in text</p>`
content = re.sub(r"\s</a>", "</a> ", content)
content = re.sub(r"", "Y", content)
content = re.sub(r"×", "x", content)
content = re.sub(r"π", "Pi", content)
content = re.sub(r"", "-", content)
content = re.sub(r"", "<=", content)
content = re.sub(r"", ">=", content)
soup = bs4.BeautifulSoup(content, "lxml")
proc = self.streamline_html(soup, f.name, self.args)
for lnk in proc.find_all("a"):
href = lnk.get("href")
if href and not href.startswith("http"):
# Internal link - replace with :ref:
code = soup.new_tag("code")
code["class"] = "interpreted-text"
code["role"] = "ref"
href_parts = href.split("#")
if len(href_parts) > 1:
# for anchor just use anchor ref
link_target = href_parts[1].lower()
else:
# for other page - use only page name
link_target = (
href_parts[0].replace(".html", "").lower()
)
if link_target:
# Looks like an anchor on the same page
code.string = f"{lnk.string} <{link_target}>"
logging.debug(f" replace {lnk} with {code}")
lnk.replace_with(code)
logging.info(f"Saving file {writer.name}")
writer.write(str(proc))
# Convert html to rst
os.system(
f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
f"--ascii -s --wrap none"
)
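            # Example of the resulting command (hypothetical names):
            #   pandoc './docs-html/temp/overview.tmp' -f html \
            #     -o './docs-html/tmp_result/service/overview.rst' \
            #     --ascii -s --wrap none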
# Post processing of rendered rst
with open(
f"{self.args.path}/tmp_result/" f"{target_path}/{target}.rst",
"r",
) as reader, open(
pathlib.Path(dest, target_path, f"{target}.rst"), "w"
) as writer:
logging.info(f"Post processing {target}...")
writer.write(f":original_name: {f.name}\n\n")
# Add root file label
writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
# post process some usual stuff
for line in reader.readlines():
processed_line = re.sub(
r"\.\.\\\\_(.*):$", r".. _\1:", line)
# replace anchor when it is itself inside some other block
# (i.e. table)
processed_line = re.sub(
r"\.\.\\\\_(.*):\s", r".. _\1: ", processed_line)
                    # For some reason the regex behaves differently locally
                    # and in Zuul - thus the same substitution twice in
                    # slightly different forms
processed_line = re.sub(
r"\.\.\\_(.*):$", r".. _\1:", processed_line)
# replace anchor when it is itself inside some other block
# (i.e. table)
processed_line = re.sub(
r"\.\.\\_(.*):\s", r".. _\1: ", processed_line)
# We could get unwanted anchors from pandoc - get rid of
# them
anchor = re.search(r"\.\. \_(.*):", processed_line)
if anchor and len(anchor.groups()) > 0:
if not self.is_element_referred(
anchor.group(1), f.name
):
# This is most likely some duplicated anchor. It is
# not referred from any other place so drop it
logging.info(
"Dropping not referred anchor '%s'",
anchor.group(1))
continue
processed_line = re.sub(
r"public_sys-resources/", "", processed_line
)
processed_line = re.sub(
r" :name: .*$", "", processed_line
)
processed_line = re.sub(
r"\*\*Parent topic:.*$", "", processed_line
)
processed_line = re.sub(
r".. code:: screen$",
r".. code-block::",
processed_line,
)
for lexer in ["json", "bash", "text", "console"]:
processed_line = re.sub(
f".. code:: {lexer}$",
f".. code-block:: {lexer}",
processed_line,
)
if re.match(rf".. code:: {lexer}\s", processed_line):
logging.error(
f"'code-block: {lexer}' with something "
"afterwards"
)
exit(1)
# spaces are important, since code-block may reside inside
# of the cell
processed_line = re.sub(
r".. code:: screen\s",
r".. code-block:: ",
processed_line,
)
processed_line = re.sub(
r".. code:: codeblock$",
r".. code-block::",
processed_line,
)
processed_line = re.sub(r"[ \t]*$", "", processed_line)
writer.write(processed_line)
# Generate indexes
for k, v in tree.items():
path = ""
title = self.args.title
page_label = ""
if k != 0:
curr = metadata_by_code[k]
title = curr["title"]
page_label = curr["uri"].replace(".html", "").lower()
path = self.get_target_path(curr["code"], metadata_by_code)
p = pathlib.Path(dest, f"{path}.rst")
if p.exists():
logging.warning(
f"{p.resolve()} is renamed into {path}/index.rst"
)
# Update existing index file
p.rename(pathlib.Path(dest, f"{path}/index.rst"))
with open(pathlib.Path(dest, path, "index.rst"), "a") as index:
index.write("\n")
index.write(".. toctree::\n")
index.write(" :maxdepth: 1\n")
index.write(" :hidden: \n\n")
for child in v:
new_name = child["new_name"]
if child["code"] in tree:
# If this is folder - add /index
new_name = new_name + "/index"
index.write(f" {new_name}\n")
else:
with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
# New index file
if page_label:
index.write(f".. _{page_label}:\n\n")
index.write("=" * (len(title)) + "\n")
index.write(title + "\n")
index.write("=" * (len(title)) + "\n")
index.write("\n")
index.write(".. toctree::\n")
index.write(" :maxdepth: 1\n\n")
for child in v:
new_name = child["new_name"]
if child["code"] in tree:
# If this is folder - add /index
new_name = new_name + "/index"
index.write(f" {new_name}\n")
# Copy used images
if len(self.doc_images) > 0:
logging.debug("Processing images...")
img_dest = pathlib.Path(dest, "_static", "images")
img_dest.mkdir(parents=True, exist_ok=True)
for img in self.doc_images:
shutil.copyfile(
pathlib.Path(self.args.path, img).resolve(strict=False),
pathlib.Path(img_dest, os.path.basename(img)).resolve(
strict=False
),
)
context = dict(
title=self.args.title,
project=self.args.service,
repo_name=self.args.repo_name,
pdf_name=self.args.pdf_name,
)
loader = FileSystemLoader([self.args.templates_location])
env = Environment(loader=loader, autoescape=select_autoescape())
for f in loader.list_templates():
outfile_tmpl = env.get_template(f)
outfile_rendered = outfile_tmpl.render(**context)
            target_file = pathlib.Path(dest, f)
target_file.parent.mkdir(parents=True, exist_ok=True)
with open(target_file, "w", encoding="utf-8", newline="") as out:
logging.debug(f"Generating {f} from template...")
out.write(outfile_rendered)
def main():
OTCDocConvertor().main()
if __name__ == "__main__":
main()