#!/usr/bin/env python3
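"""Convert exported OTC HelpCenter HTML documentation into RST sources.

The script scans a directory of HTML files together with a CLASS.TXT.json
metadata file, streamlines the HTML (notes, figures, anchors, code blocks),
renders it to RST with pandoc and generates index files for the document
tree.

Illustrative invocation (paths and names are examples only):

    python3 convert.py --title "Elastic Cloud Server" \
        --service ecs --dest result/ umn/source
"""
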
import argparse
import bs4
import json
import logging
import os
import pathlib
import re
import shutil
import sys
from jinja2 import FileSystemLoader, Environment, select_autoescape
class OTCDocConvertor:
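    """Stateful converter keeping anchor and link registries across pages."""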
def __init__(self):
self.doc_anchors = dict()
self.doc_links = dict()
@staticmethod
def get_new_name(current_name):
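        """Derive a filesystem- and URL-friendly lowercase name from a
        page title."""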
        new_name = current_name.replace(' - ', '_')
        # Characters replaced with an underscore
        for char in (' ', '/'):
            new_name = new_name.replace(char, '_')
        # Characters dropped entirely
        for char in ('\'', '"', '`', '´', ':', '?', '(', ')'):
            new_name = new_name.replace(char, '')
        return new_name.lower()
@staticmethod
def build_doc_tree(metadata):
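        """Group metadata entries into lists keyed by parent code
        (0 for root-level entries)."""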
flat_tree = dict()
for k, v in metadata.items():
parent_id = v.get('p_code')
if not parent_id:
parent_id = 0
if parent_id not in flat_tree:
flat_tree[parent_id] = list()
flat_tree[parent_id].append(v)
return flat_tree
@classmethod
    def get_target_path(cls, code, metadata):
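        """Resolve the relative output path of a page by recursively
        prepending the folder names of its parents."""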
if code in metadata:
current = metadata[code]
if not current.get('p_code'):
return current['new_name']
else:
return (
"{0}/{1}".format(
cls.get_target_path(current['p_code'], metadata),
current['new_name'])
)
else:
return ''
def make_label(self, soup, name):
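        """Create a paragraph holding an RST label placeholder.

        The literal ``..\\_<name>:`` text survives pandoc untouched and is
        rewritten into a real ``.. _<name>:`` label during post-processing.
        """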
label = soup.new_tag("p")
label.string = f"..\\_{name.lower()}:"
return label
def is_element_referred(self, ref, fname):
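        """Return True when the anchor is the target of any known link."""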
return (
ref in self.doc_links
or '#' + ref in self.doc_links
or fname + '#' + ref in self.doc_links
)
def streamline_html(self, soup, file_name):
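        """Normalize one HTML page before handing it over to pandoc.

        Deduplicates anchors, rewrites note/warning/figure markup, registers
        referenced images and classifies ``pre`` blocks so pandoc can emit
        proper code-blocks.
        """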
        # Drop anchors duplicated in the page header
        fname = file_name.replace(".html", "").lower()
        page_anchors = set()
        met_page_anchors = dict()
        for lnk in soup.body.find_all("a"):
            name = None
            if "name" in lnk.attrs and lnk.string is None:
                name = lnk.attrs["name"].lower()
                if name in met_page_anchors:
                    # Such an anchor already exists on this page, drop it
                    lnk.decompose()
                    continue
                met_page_anchors[name] = True
            if name and name == fname:
                lnk.decompose()
# Process divs
for i in soup.body.find_all('div'):
if "note" in i.get('class', []):
# Notes
del i['id']
if i.img:
i.img.decompose()
notetitle = i.find('span', class_='notetitle')
if notetitle:
title = soup.new_tag('div')
title['class'] = 'title'
title.string = 'Note:'
notetitle.replace_with(title)
elif "warning" in i.get('class', []):
# Warnings
del i['id']
if i.img:
i.img.decompose()
eltitle = i.find('span', class_='warningtitle')
if eltitle:
title = soup.new_tag('div')
title['class'] = 'title'
title.string = 'Warning:'
eltitle.replace_with(title)
elif "notice" in i.get('class', []):
# Notices
del i['id']
if i.img:
i.img.decompose()
i['class'] = 'important'
elif "caution" in i.get('class', []):
# Cautions
del i['id']
if i.img:
i.img.decompose()
elif "fignone" in i.get('class', []):
# Figures
                # When we find a figure, generate a local label (anchor)
if i.get('id'):
logging.debug('place figure label')
i.insert_before(self.make_label(soup, i.get("id")))
figure = soup.new_tag('figure')
img = i.find('img')
cap = i.find('span', class_='figcap')
if cap is not None:
cap.name = 'figcaption'
figure.append(cap)
if img:
# Store all referred images for copying
self.doc_images.add(img['src'])
img['src'] = '/_static/images/' + img['src']
del img["width"]
del img["height"]
del img["class"]
del img["title"]
figure.append(img)
i.replace_with(figure)
elif "section" in i.get('class', []):
# Sections
                # When we find a section, generate a local label (anchor)
if i.get('id'):
sec_id = i.get("id").lower()
if self.is_element_referred(sec_id, file_name):
logging.debug('Add section label')
page_anchors.add(sec_id)
i.insert_before(self.make_label(soup, sec_id))
# and still convert to paragraph
i.name = 'p'
else:
i.name = 'p'
# Process remaining images
        for img in soup.body.find_all('img'):
            src = img.get('src')
            if src and not src.startswith('/_static/images'):
                self.doc_images.add(src)
                img['src'] = '/_static/images/' + src
                del img["width"]
                del img["height"]
                del img["class"]
                del img["title"]
        # Drop strong in table headers
        for th in soup.body.find_all('th'):
            if th.p and th.p.strong:
                th.p.strong.unwrap()
        if self.args.improve_table_headers:
            # Add spaces around "/"
            for th in soup.body.find_all('th'):
                if th.p and th.p.string:
                    th.p.string = re.sub(
                        r'\b/\b',
                        ' / ',
                        th.p.string)
        # Drop strong around links
        for strong in soup.body.find_all('strong'):
            if strong.a:
                strong.unwrap()
        # Local anchors
        for lnk in soup.body.find_all("a"):
            if (
                lnk.string is None
                and "name" in lnk.attrs
                and not re.match(r"^li\d+$", lnk.attrs["name"])
                # name contains "section" anywhere
                and not re.match(r".*section\d+$", lnk.attrs["name"])
                # name starts with "table"
                and not re.match(r"^table\d+$", lnk.attrs["name"])
            ):
# Verify this is really called from somewhere:
local_ref = lnk["name"].lower()
if self.is_element_referred(local_ref, file_name):
# We now know something in the document wants this anchor -
# replace it with label
if local_ref not in page_anchors:
lnk.name = "p"
lnk.string = f"..\\_{local_ref}:"
del lnk["name"]
page_anchors.add(local_ref)
else:
                        logging.debug(
                            f"Not placing replaced anchor {local_ref} "
                            f"since it already existed")
else:
logging.debug("Dropping unreferred link")
for li in soup.body.find_all("li"):
del li['id']
# Sometimes we have code blocks with line numbers.
# <div class="codecoloring" codetype="xxx"><table class="xxx">
# <tr><td class="linenos"><div class="linenodiv"><pre>1
# 2
# 3</pre></div></td><td class="code"><div class="highlight"><pre>....
# </pre></div>
for td_lines in soup.body.find_all("td", "linenos"):
codeblock = td_lines.parent.find("td", "code")
table = td_lines.find_parent("table")
if codeblock and table:
# Replace whole table with only codeblock td
table.replace_with(codeblock)
for pre in soup.body.find_all("pre"):
text = pre.get_text()
# if text.startswith("{"):
# pre["class"] = "data"
if re.search(
r'\[[a-z]*@\w+.*\][\s#>]?',
text
):
# Something like "[root@ecs-test-0001 ~]#"
pre["class"] = "console"
elif re.match(
r'^(GET|PUT|POST|DELETE)',
text
):
# Something like "DELETE https://some_url"
pre["class"] = "text"
if "class" in pre and pre["class"] in ["codeblock"]:
pre["class"] = "text"
        # Special cases: wrap known literal character sequences into <code>
        # so that they survive the conversion as raw text
rawize_strings = [
# "\*\*\*\*\*\*",
# r"([\\\/\:\*\?\"\~|<>]{4,})"
# ModelArts UMN contain this "unallowed" sequence
r"(\\/:\*\?\"<>\|)",
# CSS UMN contain "SELECT exclude('*name') FROM my-index"
r"(\*name)"
]
        for to_rawize in rawize_strings:
            for p in soup.body.find_all(string=re.compile(to_rawize)):
                if p.string:
                    curr = p.string
                    part = re.search(to_rawize, curr)
                    if part and len(part.groups()) > 0:
                        new = curr.replace(
                            part.group(1),
                            f"<code>{part.group(1)}</code>"
                        )
                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
                        logging.debug(
                            f"Wrapped {part.group(1)} into a raw code tag")
                    else:
                        logging.error(f"String with star: {p}")
return soup.body
def main(self):
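        """Parse CLI arguments and drive the whole conversion pipeline."""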
logging.basicConfig(level=logging.DEBUG)
parser = argparse.ArgumentParser(description='Process links.')
parser.add_argument(
'path', type=str, help='path to the files')
parser.add_argument(
'--improve-table-headers', action='store_true',
help='Improve table headers by enforcing spaces around `/`')
parser.add_argument(
'--pygments-lexer',
help='Set particular code-block lexer language')
parser.add_argument(
'--dest',
help='Directory to write resulting files')
parser.add_argument(
'--title',
required=True,
help='Document title')
parser.add_argument(
'--service',
            help='Service to which the document belongs')
parser.add_argument(
'--repo-name',
help='Service repository')
parser.add_argument(
'--pdf-name',
            help='PDF file name')
parser.add_argument(
'--templates-location',
default='templates',
help='Location of additional templates')
self.args = parser.parse_args()
if self.args.dest:
dest = pathlib.Path(self.args.dest)
else:
dest = pathlib.Path(self.args.path, 'result')
dest.mkdir(parents=True, exist_ok=True)
metadata_file = pathlib.Path(
self.args.path, "CLASS.TXT.json")
meta_data = dict()
if not metadata_file.exists():
logging.warning(
f"CLASS.TXT.json file is missing in {self.args.path}, "
f"assuming initial import")
with open(pathlib.Path(dest, "index.rst"), "w") as index:
index.write('=' * (len(self.args.title)) + '\n')
index.write(self.args.title + '\n')
index.write('=' * (len(self.args.title)) + '\n')
index.write('\n')
else:
            meta_data = json.loads(metadata_file.read_text())
metadata_by_uri = dict()
metadata_by_code = dict()
self.doc_images = set()
for f in meta_data:
f['new_name'] = self.get_new_name(f['title'])
metadata_by_uri[f['uri']] = f
metadata_by_code[f.get('code')] = f
tree = self.build_doc_tree(metadata_by_code)
pathlib.Path(self.args.path, "temp/").mkdir(
parents=True, exist_ok=True)
# Scan all docs for anchors
for f in pathlib.Path(self.args.path).glob("*.html"):
if f.name not in metadata_by_uri:
continue
# Registering section links
with open(f, 'r') as reader:
logging.debug(f"Scanning {f.name}")
content = reader.read()
soup = bs4.BeautifulSoup(content, "lxml")
for lnk in soup.body.find_all('a'):
if "name" in lnk.attrs and lnk.string is None:
anchor = lnk.attrs["name"]
title = re.sub('[ _:]', '-', anchor)
res = dict(
fname=f.name,
title=title,
replace=title.lower()
)
self.doc_anchors[anchor] = res
if "href" in lnk.attrs and lnk["href"]:
self.doc_links[lnk["href"].lower()] = f.name
for f in pathlib.Path(self.args.path).glob("*.html"):
if f.name not in metadata_by_uri:
continue
_target = metadata_by_uri[f.name]
target = _target['new_name']
target_path = self.get_target_path(
_target['p_code'], metadata_by_code)
pathlib.Path(self.args.path, "temp").mkdir(
parents=True, exist_ok=True)
pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
parents=True, exist_ok=True)
pathlib.Path(dest, target_path).mkdir(
parents=True, exist_ok=True)
# Pre-processing of html content
with open(f, 'r') as reader, \
open(pathlib.Path(self.args.path,
f"temp/{target}.tmp"), 'w') as writer:
# if f.name not in [
# ]:
# continue
logging.info(f"Pre-Processing {f} as {target}")
content = reader.read()
soup = bs4.BeautifulSoup(content, "lxml")
proc = self.streamline_html(soup, f.name)
for lnk in proc.find_all("a"):
href = lnk.get('href')
if href and not href.startswith('http'):
# Internal link - replace with :ref:
code = soup.new_tag('code')
code['class'] = "interpreted-text"
code['role'] = "ref"
href_parts = href.split('#')
if len(href_parts) > 1:
# for anchor just use anchor ref
link_target = href_parts[1].lower()
else:
# for other page - use only page name
link_target = href_parts[0].replace(
".html", "").lower()
                        if link_target:
                            # Point the :ref: at the anchor or page label
                            code.string = f"{lnk.string} <{link_target}>"
logging.debug(f" replace {lnk} with {code}")
lnk.replace_with(code)
# Drop parent link at the bottom of the page
for parent in proc.find_all("p", class_="parentlink"):
parent.decompose()
logging.info(f'Saving file {writer.name}')
writer.write(str(proc))
# Convert html to rst
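            # pandoc flags: --ascii keeps the output ASCII-only, -s emits a
            # standalone document and --wrap none disables line wrapping so
            # the line-based post-processing below stays reliable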
os.system(
f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
f"--ascii -s --wrap none"
)
# Post processing of rendered rst
with open(f"{self.args.path}/tmp_result/"
f"{target_path}/{target}.rst", 'r') \
as reader, \
open(pathlib.Path(dest, target_path,
f"{target}.rst"), 'w') as writer:
logging.info(f"Post processing {target}...")
writer.write(f":original_name: {f.name}\n\n")
# Add root file label
writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
# post process some usual stuff
for line in reader.readlines():
processed_line = re.sub(r'\.\.\\_', '.. _', line)
                    # The source tables mark supported features with a
                    # check-mark glyph (presumably '√'; the character is
                    # easily lost in copies); normalize it to an ASCII "Y"
                    processed_line = re.sub(r'√', 'Y', processed_line)
processed_line = re.sub(
r'public_sys-resources/', '', processed_line)
processed_line = re.sub(
r' :name: .*$', '', processed_line)
processed_line = re.sub(
r'\*\*Parent topic:.*$', '', processed_line)
processed_line = re.sub(
r'.. code:: screen$',
r'.. code-block::', processed_line)
for lexer in ["json", "bash", "text", "console"]:
processed_line = re.sub(
f".. code:: {lexer}$",
f".. code-block:: {lexer}", processed_line)
if re.match(rf".. code:: {lexer}\s", processed_line):
logging.error(
f"'code-block: {lexer}' with something "
"afterwards")
                                sys.exit(1)
                    # The trailing whitespace matters: a code-block may
                    # reside inside a table cell
processed_line = re.sub(
r'.. code:: screen\s',
r'.. code-block:: ', processed_line)
processed_line = re.sub(
r'.. code:: codeblock$',
r'.. code-block::', processed_line)
processed_line = re.sub(r'[ \t]*$', '', processed_line)
writer.write(processed_line)
        # Generate indexes
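        # ``tree`` maps a parent code to its children; key 0 is the virtual
        # root (served by the top-level index), any other key is a page that
        # has children and therefore becomes a folder with its own index.rst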
for k, v in tree.items():
path = ''
title = self.args.title
page_label = ''
if k != 0:
curr = metadata_by_code[k]
title = curr['title']
page_label = curr['uri'].replace(".html", "").lower()
path = self.get_target_path(curr['code'], metadata_by_code)
p = pathlib.Path(dest, f"{path}.rst")
if p.exists():
logging.warning(
f"{p.resolve()} is renamed into {path}/index.rst"
)
# Update existing index file
p.rename(pathlib.Path(dest, f"{path}/index.rst"))
with open(pathlib.Path(dest, path, "index.rst"), "a") as index:
index.write('\n')
index.write('.. toctree::\n')
index.write(' :maxdepth: 1\n')
                index.write(' :hidden:\n\n')
for child in v:
new_name = child['new_name']
if child['code'] in tree:
                        # If this is a folder, point at its index
new_name = new_name + '/index'
index.write(f" {new_name}\n")
else:
with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
# New index file
if page_label:
index.write(f".. _{page_label}:\n\n")
index.write('=' * (len(title)) + '\n')
index.write(title + '\n')
index.write('=' * (len(title)) + '\n')
index.write('\n')
index.write('.. toctree::\n')
index.write(' :maxdepth: 1\n\n')
for child in v:
new_name = child['new_name']
if child['code'] in tree:
                        # If this is a folder, point at its index
new_name = new_name + '/index'
index.write(f" {new_name}\n")
# Copy used images
if len(self.doc_images) > 0:
logging.debug("Processing images...")
img_dest = pathlib.Path(dest, '_static', 'images')
img_dest.mkdir(parents=True, exist_ok=True)
for img in self.doc_images:
shutil.copyfile(
pathlib.Path(self.args.path, img).resolve(strict=False),
pathlib.Path(
img_dest, os.path.basename(img)).resolve(strict=False)
)
context = dict(
title=self.args.title,
project=self.args.service,
repo_name=self.args.repo_name,
pdf_name=self.args.pdf_name
)
loader = FileSystemLoader([self.args.templates_location])
env = Environment(loader=loader, autoescape=select_autoescape())
for f in loader.list_templates():
outfile_tmpl = env.get_template(f)
outfile_rendered = outfile_tmpl.render(**context)
            # Use the resolved ``dest`` so a missing --dest still falls back
            # to <path>/result instead of passing None to pathlib
            target_file = pathlib.Path(dest, f)
target_file.parent.mkdir(parents=True, exist_ok=True)
with open(
target_file, 'w', encoding='utf-8', newline=''
) as out:
logging.debug(f"Generating {f} from template...")
out.write(outfile_rendered)
def main():
OTCDocConvertor().main()
if __name__ == "__main__":
main()