#!/usr/bin/env python3
import argparse
import bs4
import json
import logging
import os
import pathlib
import re
import shutil
from jinja2 import FileSystemLoader, Environment, select_autoescape
class OTCDocConvertor:
def __init__(self):
self.doc_anchors = dict()
self.doc_links = dict()
@staticmethod
def get_new_name(current_name):
new_name = current_name.replace(" - ", "_")
        # This is a unicode en dash character
new_name = new_name.replace("–", "_")
new_name = new_name.replace(" ", "_")
new_name = new_name.replace("/", "_")
new_name = new_name.replace("=", "_")
new_name = new_name.replace("+", "")
new_name = new_name.replace("'", "")
new_name = new_name.replace('"', "")
new_name = new_name.replace("`", "")
new_name = new_name.replace("´", "")
new_name = new_name.replace(":", "")
new_name = new_name.replace("?", "")
new_name = new_name.replace("(", "")
new_name = new_name.replace(")", "")
new_name = new_name.replace(",", "")
new_name = new_name.replace("!", "")
new_name = new_name.replace("<", "")
new_name = new_name.replace(">", "")
new_name = new_name.replace("$", "")
new_name = new_name.replace("#", "sharp")
new_name = new_name.replace("%", "pct")
new_name = new_name.replace('_&_', '_and_')
new_name = re.sub(r'(\w+)&(\w+)', r'\1_and_\2', new_name)
new_name = re.sub('(_+)', '_', new_name)
new_name = new_name.lower()
return new_name
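    # Illustrative example (hypothetical document title, not taken from any
    # real metadata): get_new_name("Creating a VPC / Subnet?") returns
    # "creating_a_vpc_subnet" - spaces and slashes become underscores,
    # punctuation is dropped, underscore runs are collapsed and the result
    # is lowercased.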
@staticmethod
def build_doc_tree(metadata):
flat_tree = dict()
for k, v in metadata.items():
parent_id = v.get("p_code")
if not parent_id:
parent_id = 0
if parent_id not in flat_tree:
flat_tree[parent_id] = list()
flat_tree[parent_id].append(v)
return flat_tree
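    # Illustrative example (hypothetical codes): for a metadata mapping
    #   {"1": {"code": "1", "p_code": None}, "2": {"code": "2", "p_code": "1"}}
    # build_doc_tree returns {0: [<entry "1">], "1": [<entry "2">]} - root
    # entries are grouped under the key 0, children under their parent code.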
@classmethod
def get_target_path(cls, code, metadata):
if code in metadata:
current = metadata[code]
if not current.get("p_code"):
return current["new_name"]
else:
return "{0}/{1}".format(
cls.get_target_path(current["p_code"], metadata),
current["new_name"],
)
else:
return ""
def make_label(self, soup, name):
label = soup.new_tag("p")
label.string = f"..\\_{name.lower()}:"
return label
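    # Illustrative example (hypothetical anchor name): make_label(soup,
    # "EN-US_TOPIC_001") returns a fresh <p> tag whose text is the literal
    # string "..\_en-us_topic_001:".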
def is_element_referred(self, ref, fname):
return (
ref in self.doc_links
or "#" + ref in self.doc_links
or fname.lower() + "#" + ref in self.doc_links
)
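    # Illustrative example (hypothetical link targets): with
    # self.doc_links == {"guide.html#section1": "index.html"},
    # is_element_referred("section1", "guide.html") is True because the
    # lookup also tries the lowercase "<file name>#<ref>" form.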
def rawize_me(self, soup, expressions):
for to_rawize in expressions:
for p in soup.body.find_all(string=re.compile(to_rawize)):
if p.string and p.parent.name not in [
"b", "strong", "pre", "code"]:
curr = p.string
part = re.search(to_rawize, curr)
# We should not escape inside of bold - this is wrong
if len(part.groups()) > 0:
logging.debug(
"Found element to rawize %s", part.group(1)
)
                        new = curr.replace(
                            part.group(1), f"<code>{part.group(1)}</code>"
                        )
logging.debug("Replacing string with: %s", new)
p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
logging.debug("Replacing string with: %s", p.string)
else:
logging.error(
"Cannot find string for rawization anymore"
)
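    # Illustrative example: rawize_me(soup, [r"(-\|-)"]) (an expression also
    # used below in streamline_html) wraps every "-|-" occurrence found in
    # plain text into a <code> element, so the sequence survives conversion
    # as an inline literal instead of being interpreted as markup.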
def streamline_html(self, soup, file_name, args=None):
        # Drop anchors duplicated on the page or matching the page header
fname = file_name.replace(".html", "").lower()
page_anchors = set()
met_page_anchors = dict()
for lnk in soup.body.find_all("a"):
name = None
if "name" in lnk.attrs and lnk.string is None:
name = lnk.attrs["name"].lower()
if name in met_page_anchors:
# Such anchor already existed on this page, drop it
lnk.decompose()
met_page_anchors[name] = True
if name and name.lower() == fname:
lnk.decompose()
# Process divs
for i in soup.body.find_all("div"):
if i.decomposed:
                # If an element was decomposed earlier in this loop it may
                # still be returned here in the list. Skip those
continue
if "note" in i.get("class", []):
# Notes
del i["id"]
if i.img:
i.img.decompose()
notetitle = i.find("span", class_="notetitle")
notebody = i.find(class_="notebody")
if not (notebody and notebody.get_text()):
# Some smart people make empty notes. Since this is
# breaking layout we need to drop those
i.decompose()
elif notetitle:
title = soup.new_tag("div")
title["class"] = "title"
title.string = "Note:"
notetitle.replace_with(title)
elif "warning" in i.get("class", []):
# Warnings
del i["id"]
if i.img:
i.img.decompose()
eltitle = i.find("span", class_="warningtitle")
if eltitle:
title = soup.new_tag("div")
title["class"] = "title"
title.string = "Warning:"
eltitle.replace_with(title)
elif "notice" in i.get("class", []):
# Notices
del i["id"]
if i.img:
i.img.decompose()
i["class"] = "important"
elif "caution" in i.get("class", []):
# Cautions
del i["id"]
if i.img:
i.img.decompose()
elif "fignone" in i.get("class", []):
# Figures
                # When we find a figure, generate a local label (anchor)
if i.get("id"):
logging.debug("place figure label")
i.insert_before(self.make_label(soup, i.get("id")))
figure = soup.new_tag("figure")
img = i.find("img")
cap = i.find("span", class_="figcap")
if cap is not None:
cap.name = "figcaption"
figure.append(cap)
if img:
# Store all referred images for copying
self.doc_images.add(img["src"])
img["src"] = (
"/_static/images/"
+ os.path.basename(img["src"])
)
del img["width"]
del img["height"]
del img["class"]
del img["title"]
del img["name"]
del img["id"]
figure.append(img)
i.replace_with(figure)
elif "section" in i.get("class", []):
# Sections
if i.get("id"):
                    # When we find a section, generate a local label (anchor)
sec_id = i.get("id").lower()
if self.is_element_referred(sec_id, file_name):
page_anchors.add(sec_id)
i.insert_before(self.make_label(soup, sec_id))
i.unwrap()
elif i.get("id") and i.get("id").startswith("body"):
i.unwrap()
else:
i.name = "p"
# Process remaining images
for img in soup.body.find_all("img"):
if img["src"] and not img["src"].startswith("/_static/images"):
self.doc_images.add(img["src"])
img["src"] = "/_static/images/" + os.path.basename(img["src"])
del img["width"]
del img["height"]
del img["class"]
del img["title"]
del img["id"]
        # Drop strong in table headers
for th in soup.body.find_all("th"):
if th.p.strong:
th.p.strong.unwrap()
if args and args.improve_table_headers:
# Add spaces around "/"
for th in soup.body.find_all("th"):
if hasattr(th, "p") and th.p.string:
th.p.string = re.sub(r"\b/\b", " / ", th.p.string)
        # Drop strong around links
for strong in soup.body.find_all("strong"):
if strong.a:
strong.unwrap()
        # Table anchors: some tables are referred to; some have an anchor in
        # front, some do not. To cope with that we analyze every table and,
        # if it is referred to, prepend an anchor. The following anchor
        # processing will skip it, since such an anchor is already placed on
        # the page
for table in soup.body.find_all("table"):
# Verify this is really called from somewhere:
if table.get("id"):
local_ref = table["id"].lower()
if self.is_element_referred(local_ref, file_name):
# We now know something in the document wants this anchor -
# replace it with label
if local_ref not in page_anchors:
lnk = bs4.BeautifulSoup(
f"
..\\_{local_ref}:
", "html.parser" ) table.insert_before(lnk) page_anchors.add(local_ref) else: logging.debug( "Not placing replaced anchor %s " "since it already existed", local_ref, ) # local anchors for lnk in soup.body.find_all("a"): if ( lnk.string is None and lnk.has_attr("name") and not re.match(r"^li\d+$", lnk.attrs["name"]) # anywhere section and not re.match(r".*section\d+$", lnk.attrs["name"]) # starts with table and not re.match(r"^table\d+$", lnk.attrs["name"]) ): # Verify this is really called from somewhere: local_ref = lnk["name"].lower() if self.is_element_referred(local_ref, file_name): # We now know something in the document wants this anchor - # replace it with label if local_ref not in page_anchors: logging.debug("Adding anchor") lnk.name = "p" lnk.string = f"..\\_{local_ref}:" del lnk["name"] page_anchors.add(local_ref) else: logging.debug( "Not placing replaced anchor %s " " since it already existed", local_ref, ) else: logging.debug("Dropping unreferred link %s", lnk) # Undeline element should not be used at all for underline in soup.body.find_all("u"): underline.unwrap() for li in soup.body.find_all("li"): if not li.get("id"): continue local_ref = li.get("id").lower() # Delete li ID to prevent broken RST containers del li["id"] if ( self.is_element_referred(local_ref, file_name) and local_ref not in page_anchors ): # LI item referred, but no anchor present logging.debug("Adding missing li %s anchor", local_ref) li.insert(0, self.make_label(soup, local_ref)) page_anchors.add(local_ref) # Sometimes we have code blocks with line numbers. #1 # 2 # 3 | ....
# {em.string}" em.replace_with(bs4.BeautifulSoup(new, "html.parser")) # Incredibly dirty hacks: rawize_expressions = [ # DWS Dev Guide harcodes r"^(1\|\"iamtext\"\|\"iamvarchar\"\|2006-07-07\|12:00:00)$", r"^(2\|\"iamtext\"\|\"iamvarchar\"\|2022-07-07\|19:00:02)$", r"(\*max_files_per_process\*3)", r"(&&, &&&, .* <#>)", # r"(\*\+)", r"(-\|-)", r"(^-{8}$)" ] self.rawize_me(soup, rawize_expressions) # Special asterisks treatement escape_asterisk_re = r"\((\*)[\.,]" self.rawize_me(soup, [escape_asterisk_re]) # And now remaining specialities rawize_strings = [ # "\*\*\*\*\*\*", # r"([\\\/\:\*\?\"\~|<>]{4,})" # ModelArts UMN contain this "unallowed" sequence r"(\\/:\*\?\"<>\|)", # CSS UMN contain "SELECT exclude('*name') FROM my-index" r"(\*name)", # DMS UMN contain: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?) r"\(([\W\x60_]{10,})\)", # MRS UMN contain: /:*?"<>|\\;&,'`!{}[]$%+ r"\s([^a-zA-Z0-9\s]{8,})", # MRS operation guide contain: /*+ MAPJOIN(join_table) \*/ r"\s(/\*.*\*/)", # BMS API contain sequence in a dedicated paragraph r"^([^a-zA-Z0-9\s]{10,})$", # OBS special chars - "\$" "\\" etc r"^(\\[\$\\bfnrtvu]{1})$", # CES contains: urn:smn:([a-z]|[A-Z]|[0-9]|\\-){1,32}:.... r"\s(urn:smn:\(.*)\.", # "-" only (in tables) is considered as list r"^(-)$", # MRS component guide has: "./mydate_\\\\d*/" r"\w(_)\\", # Pandoc is not properly escaping _ before special chars what makes # it invalid for Sphinx. A bit weird regex: # "[:space:][:word:][:underscore:][:comma:]" r"\s([\w_]+_)[,]", # DWS dirty-fixes part 2 r"/(\*\+)", ] self.rawize_me(soup, rawize_strings) # Pandoc seem to be not escaping properly asterists which are # immediately following non word chars # (https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44) # NOTE(gtema): # 1. this is on purpose placed here since we want to have some special # escapings above # 2. 
        # 2. we are not escaping asterisks at the end of the paragraphs
        #    (pandoc deals correctly with that)
        re_escape = re.compile(r"([-:/'\"<\([{])(\*+)(.)")
        for p in soup.body.find_all(string=re_escape):
            if p.string and p.parent.name == "p":
                p.string.replace_with(
                    re.sub(re_escape, r"\1``\2``\3", p.string))
        # Drop parent link at the bottom of the page
        for parent in soup.body.find_all("p", class_="familylinks"):
            parent.decompose()
        return soup.body

    def main(self):
        logging.basicConfig(level=logging.DEBUG)
        parser = argparse.ArgumentParser(description="Process links.")
        parser.add_argument("path", type=str, help="path to the files")
        parser.add_argument(
            "--improve-table-headers",
            action="store_true",
            help="Improve table headers by enforcing spaces around `/`",
        )
        parser.add_argument(
            "--pygments-lexer",
            help="Set particular code-block lexer language"
        )
        parser.add_argument(
            "--dest",
            help="Directory to write resulting files"
        )
        parser.add_argument("--title", required=True, help="Document title")
        parser.add_argument(
            "--service",
            help="Service to which the document belongs to"
        )
        parser.add_argument("--repo-name", help="Service repository")
        parser.add_argument("--pdf-name", help="PDF File name")
        parser.add_argument(
            "--templates-location",
            default="templates",
            help="Location of additional templates",
        )
        self.args = parser.parse_args()
        if self.args.dest:
            dest = pathlib.Path(self.args.dest)
        else:
            dest = pathlib.Path(self.args.path, "result")
        dest.mkdir(parents=True, exist_ok=True)
        metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
        meta_data = dict()
        if not metadata_file.exists():
            logging.warning(
                f"CLASS.TXT.json file is missing in {self.args.path}, "
                f"assuming initial import"
            )
            with open(pathlib.Path(dest, "index.rst"), "w") as index:
                index.write("=" * (len(self.args.title)) + "\n")
                index.write(self.args.title + "\n")
                index.write("=" * (len(self.args.title)) + "\n")
                index.write("\n")
        else:
            meta_data = json.loads(open(metadata_file).read())
        metadata_by_uri = dict()
        metadata_by_code = dict()
        self.doc_images = set()
        for f in meta_data:
            f["new_name"] = self.get_new_name(f["title"])
            metadata_by_uri[f["uri"]] = f
            metadata_by_code[f.get("code")] = f
        tree = self.build_doc_tree(metadata_by_code)
        pathlib.Path(self.args.path, "temp/").mkdir(
            parents=True, exist_ok=True
        )
        # Scan all docs for anchors
        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            # Registering section links
            with open(f, "r") as reader:
                logging.debug(f"Scanning {f.name}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                for lnk in soup.body.find_all("a"):
                    if "name" in lnk.attrs and lnk.string is None:
                        anchor = lnk.attrs["name"]
                        title = re.sub("[ _:]", "-", anchor)
                        res = dict(
                            fname=f.name.lower(),
                            title=title,
                            replace=title.lower()
                        )
                        self.doc_anchors[anchor] = res
                    if "href" in lnk.attrs and lnk["href"]:
                        self.doc_links[lnk["href"].lower()] = f.name
        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            _target = metadata_by_uri[f.name]
            target = _target["new_name"]
            target_path = self.get_target_path(
                _target["p_code"], metadata_by_code
            )
            pathlib.Path(self.args.path, "temp").mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)
            # Pre-processing of html content
            with open(f, "r") as reader, open(
                pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
            ) as writer:
                # if f.name not in [
                # ]:
                #     continue
logging.info(f"Pre-Processing {f} as {target}") content = reader.read() # Preprocess - Fix space inside link and not text # i.e. ` |