#!/usr/bin/env python3

import argparse
import bs4
import json
import logging
import os
import pathlib
import re
import shutil
import sys


class OTCDocConvertor:
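    """Convert HTML documentation exports into RST sources.

    Flow driven by ``main``: read the ``CLASS.TXT.json`` metadata, scan
    every listed HTML page for anchors and links, streamline the HTML
    (notes, figures, sections, tables, images, internal links), convert
    each page to RST with pandoc, post-process the rendered RST, generate
    ``index.rst`` toctree files and copy the referenced images.
    """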

    def __init__(self):
        self.doc_anchors = dict()
        self.doc_links = dict()
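
    # Registries filled while scanning the exported pages (shapes shown with
    # illustrative values):
    #   doc_anchors["EN-US_TOPIC_0000"] = {"fname": "page_a.html",
    #                                      "title": "EN-US-TOPIC-0000",
    #                                      "replace": "en-us-topic-0000"}
    #   doc_links["page_a.html#en-us_topic_0000"] = "page_b.html"
    # doc_links maps a lowercased href to the page containing the link and
    # backs is_element_referred().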

    @staticmethod
    def get_new_name(current_name):
        new_name = current_name.replace(' - ', '_')
        new_name = new_name.replace(' ', '_')
        new_name = new_name.replace('/', '_')
        new_name = new_name.replace('\'', '')
        new_name = new_name.replace('"', '')
        new_name = new_name.replace('`', '')
        new_name = new_name.replace('´', '')
        new_name = new_name.replace(':', '')
        new_name = new_name.replace('?', '')
        new_name = new_name.replace('(', '')
        new_name = new_name.replace(')', '')
        new_name = new_name.lower()
        return new_name
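
    # Examples of the title-to-filename normalization above (hypothetical
    # titles, not taken from real metadata):
    #   get_new_name("Creating an ECS")         -> "creating_an_ecs"
    #   get_new_name("GPU/AI Instances (BMS)?") -> "gpu_ai_instances_bms"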

    @staticmethod
    def build_doc_tree(metadata):
        flat_tree = dict()
        for k, v in metadata.items():
            parent_id = v.get('p_code')
            if not parent_id:
                parent_id = 0

            if parent_id not in flat_tree:
                flat_tree[parent_id] = list()
            flat_tree[parent_id].append(v)
        return flat_tree
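
    # build_doc_tree groups metadata entries by their parent code, e.g.
    # (illustrative entries):
    #   {"1": {"code": "1", "title": "Overview"},
    #    "2": {"code": "2", "p_code": "1", "title": "Concepts"}}
    # becomes {0: [<entry "1">], "1": [<entry "2">]}; entries without a
    # p_code land under the artificial root key 0.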

    @classmethod
    def get_target_path(cls, code, metadata, path=''):
        if code in metadata:
            current = metadata[code]
            if not current.get('p_code'):
                return current['new_name']
            else:
                return (
                    "{0}/{1}".format(
                        cls.get_target_path(current['p_code'], metadata),
                        current['new_name'])
                )
        else:
            return ''
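
    # get_target_path joins the sanitized new_name values along the p_code
    # chain into a relative directory path, e.g. (hypothetical codes): a node
    # whose parent chain resolves to "ecs" -> "instances" yields
    # "ecs/instances/<new_name>".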

    def make_label(self, soup, name):
        label = soup.new_tag("p")
        label.string = f"..\\_{name.lower()}:"
        return label
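
    # The "..\_name:" paragraph produced by make_label survives the pandoc
    # run as literal text and is rewritten into a real RST label
    # (".. _name:") by the post-processing step in main().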

    def is_element_referred(self, ref, fname):
        return (
            ref in self.doc_links
            or '#' + ref in self.doc_links
            or fname + '#' + ref in self.doc_links
        )
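
    # Example (illustrative): if the scan phase stored
    # doc_links["other.html#section0"], then
    # is_element_referred("section0", "other.html") is True and the matching
    # anchor or section in other.html gets a label.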

    def streamline_html(self, soup, file_name):
        # Drop duplicated header anchors (and the page's own self-anchor)
        fname = file_name.replace(".html", "").lower()
        met_page_anchors = dict()
        for lnk in soup.body.find_all("a"):
            name = None
            if "name" in lnk.attrs and lnk.string is None:
                name = lnk.attrs["name"].lower()
                if name in met_page_anchors:
                    # Anchor already seen on this page, drop the duplicate
                    lnk.decompose()
                met_page_anchors[name] = True

            if name and name.lower() == fname:
                lnk.decompose()

        # Process divs
        for i in soup.body.find_all('div'):
            if "note" in i.get('class', []):
                # Notes
                del i['id']
                if i.img:
                    i.img.decompose()
                notetitle = i.find('span', class_='notetitle')
                if notetitle:
                    title = soup.new_tag('div')
                    title['class'] = 'title'
                    title.string = 'Note:'
                    notetitle.replace_with(title)
            elif "notice" in i.get('class', []):
                # Notices
                del i['id']
                if i.img:
                    i.img.decompose()
                i['class'] = 'important'
            elif "caution" in i.get('class', []):
                # Cautions
                del i['id']
                if i.img:
                    i.img.decompose()
            elif "fignone" in i.get('class', []):
                # Figures
                # When we find a figure, generate a local label (anchor)
                if i.get('id'):
                    logging.debug('place figure label')
                    i.insert_before(self.make_label(soup, i.get("id")))
                figure = soup.new_tag('figure')
                img = i.find('img')
                cap = i.find('span', class_='figcap')
                if cap is not None:
                    cap.name = 'figcaption'
                    figure.append(cap)
                if img:
                    # Store all referred images for copying
                    self.doc_images.add(img['src'])
                    img['src'] = '/_static/images/' + img['src']
                    figure.append(img)
                i.replace_with(figure)
            elif "section" in i.get('class', []):
                # Sections
                # When we find a section, generate a local label (anchor)
                if i.get('id'):
                    sec_id = i.get("id").lower()
                    if self.is_element_referred(sec_id, file_name):
                        logging.debug('Add section label')
                        i.insert_before(self.make_label(soup, sec_id))
                # and still convert the section div to a paragraph
                i.name = 'p'
            else:
                i.name = 'p'

        # Process remaining images
        for img in soup.body.find_all('img'):
            if img['src'] and not img['src'].startswith('/_static/images'):
                self.doc_images.add(img['src'])
                img['src'] = '/_static/images/' + img['src']

        # Drop <strong> emphasis in table headers
        for th in soup.body.find_all('th'):
            if th.p.strong:
                th.p.strong.unwrap()

        if self.args.improve_table_headers:
            # Add spaces around "/"
            for th in soup.body.find_all('th'):
                if hasattr(th, 'p') and th.p.string:
                    th.p.string = re.sub(
                        r'\b/\b',
                        ' / ',
                        th.p.string)

        # Local anchors: turn referred <a name="..."> anchors into labels
        for lnk in soup.body.find_all("a"):
            if (
                lnk.string is None
                and hasattr(lnk, "name")
                and not re.match(r"^li\d+$", lnk.attrs["name"])
                # "sectionNNN" anywhere in the name
                and not re.match(r".*section\d+$", lnk.attrs["name"])
                # starts with "table"
                and not re.match(r"^table\d+$", lnk.attrs["name"])
            ):
                # Verify this is really referred from somewhere:
                local_ref = lnk["name"].lower()
                if self.is_element_referred(local_ref, file_name):
                    # Something in the document wants this anchor -
                    # replace it with a label
                    lnk.name = "p"
                    lnk.string = f"..\\_{local_ref}:"
                    del lnk["name"]
                else:
                    logging.debug("Dropping unreferred link")
for li in soup.body.find_all("li"):
|
||
del li['id']
|
||
|
||
for pre in soup.body.find_all("pre"):
|
||
text = pre.get_text()
|
||
# if text.startswith("{"):
|
||
# pre["class"] = "data"
|
||
if re.search(
|
||
r'\[[a-z]*@\w+.*\][\s#>]?',
|
||
text
|
||
):
|
||
# Something like "[root@ecs-test-0001 ~]#"
|
||
pre["class"] = "console"
|
||
elif re.match(
|
||
r'^(GET|PUT|POST|DELETE)',
|
||
text
|
||
):
|
||
# Something like "DELETE https://some_url"
|
||
pre["class"] = "text"
|
||
|
||
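
        # The "console" and "text" classes assigned above are picked up
        # later: the post-processing loop in main() rewrites the
        # ".. code:: console" / ".. code:: text" lines produced by pandoc
        # into ".. code-block:: ..." directives.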

        # And now the special cases
        rawize_strings = [
            # "\*\*\*\*\*\*",
            # r"([\\\/\:\*\?\"\~|<>]{4,})"
            # ModelArts UMN contains this "unallowed" sequence
            r"(\\/:\*\?\"<>\|)"
        ]
        for to_rawize in rawize_strings:
            for p in soup.body.find_all(string=re.compile(to_rawize)):
                if p.string:
                    curr = p.string
                    part = re.search(to_rawize, curr)
                    if len(part.groups()) > 0:
                        new = curr.replace(
                            part.group(1),
                            f"<code>{part.group(1)}</code>"
                        )
                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
                        print(part.group(1))
                        print(f"New content is {p.string}")
                    else:
                        print('ups')
                        logging.error(f"String with star: {p}")

        return soup.body

    def main(self):
        logging.basicConfig(level=logging.DEBUG)
        parser = argparse.ArgumentParser(description='Process links.')
        parser.add_argument(
            'path', type=str, help='path to the files')
        parser.add_argument(
            '--improve-table-headers', action='store_true',
            help='Improve table headers by enforcing spaces around `/`')
        parser.add_argument(
            '--pygments-lexer',
            help='Set particular code-block lexer language')
        parser.add_argument(
            '--dest',
            help='Directory to write resulting files')
        self.args = parser.parse_args()
        if self.args.dest:
            dest = pathlib.Path(self.args.dest)
        else:
            dest = pathlib.Path(self.args.path, 'result')
        dest.mkdir(parents=True, exist_ok=True)

        metadata_file = pathlib.Path(
            self.args.path, "CLASS.TXT.json")

        if not metadata_file.exists():
            logging.warning(
                f"CLASS.TXT.json file is missing in {self.args.path}, "
                f"assuming initial import")
            sys.exit(0)
        meta_data = json.loads(open(metadata_file).read())
        metadata_by_uri = dict()
        metadata_by_code = dict()
        self.doc_images = set()
        for f in meta_data:
            f['new_name'] = self.get_new_name(f['title'])
            metadata_by_uri[f['uri']] = f
            metadata_by_code[f.get('code')] = f
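
        # CLASS.TXT.json is expected to be a list of entries like
        # (illustrative values):
        #   {"code": "10", "p_code": "1", "title": "Creating an ECS",
        #    "uri": "en-us_topic_0000.html"}
        # "title" is sanitized into "new_name", "uri" keys metadata_by_uri,
        # and "code"/"p_code" drive the directory tree.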
        tree = self.build_doc_tree(metadata_by_code)

        pathlib.Path(self.args.path, "temp/").mkdir(
            parents=True, exist_ok=True)

        # Scan all docs for anchors
        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            # Registering section links
            with open(f, 'r') as reader:
                logging.debug(f"Scanning {f.name}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                for lnk in soup.body.find_all('a'):
                    if "name" in lnk.attrs and lnk.string is None:
                        anchor = lnk.attrs["name"]
                        title = re.sub('[ _:]', '-', anchor)
                        res = dict(
                            fname=f.name,
                            title=title,
                            replace=title.lower()
                        )
                        self.doc_anchors[anchor] = res
                    if "href" in lnk.attrs and lnk["href"]:
                        self.doc_links[lnk["href"].lower()] = f.name
for f in pathlib.Path(self.args.path).glob("*.html"):
|
||
if f.name not in metadata_by_uri:
|
||
continue
|
||
_target = metadata_by_uri[f.name]
|
||
target = _target['new_name']
|
||
target_path = self.get_target_path(
|
||
_target['p_code'], metadata_by_code)
|
||
pathlib.Path(self.args.path, "temp").mkdir(
|
||
parents=True, exist_ok=True)
|
||
pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
|
||
parents=True, exist_ok=True)
|
||
pathlib.Path(dest, target_path).mkdir(
|
||
parents=True, exist_ok=True)
|
||
|
||

            # Pre-processing of html content
            with open(f, 'r') as reader, \
                    open(pathlib.Path(self.args.path,
                                      f"temp/{target}.tmp"), 'w') as writer:
                # if f.name not in [
                # ]:
                #     continue
                logging.info(f"Pre-Processing {f} as {target}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                proc = self.streamline_html(soup, f.name)
for lnk in proc.find_all("a"):
|
||
href = lnk.get('href')
|
||
if href and not href.startswith('http'):
|
||
# Internal link - replace with :ref:
|
||
code = soup.new_tag('code')
|
||
code['class'] = "interpreted-text"
|
||
code['role'] = "ref"
|
||
href_parts = href.split('#')
|
||
if len(href_parts) > 1:
|
||
# for anchor just use anchor ref
|
||
link_target = href_parts[1].lower()
|
||
else:
|
||
# for other page - use only page name
|
||
link_target = href_parts[0].replace(
|
||
".html", "").lower()
|
||
if link_target:
|
||
# Looks like an anchor on the same page
|
||
code.string = f"{lnk.string} <{link_target}>"
|
||
logging.debug(f" replace {lnk} with {code}")
|
||
lnk.replace_with(code)
|
||
|
||
# Drop parent link at the bottom of the page
|
||
for parent in proc.find_all("p", class_="parentlink"):
|
||
parent.decompose()
|
||
|
||
logging.info(f'Saving file {writer.name}')
|
||
writer.write(str(proc))
|
||
|
||

            # Convert html to rst
            os.system(
                f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
                f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
                f"--ascii -s --wrap none"
            )
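
            # Expanded, the pandoc call looks roughly like (hypothetical
            # paths):
            #   pandoc 'docs/ecs/temp/creating_an_ecs.tmp' -f html \
            #     -o 'docs/ecs/tmp_result/instances/creating_an_ecs.rst' \
            #     --ascii -s --wrap none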

            # Post processing of rendered rst
            with open(f"{self.args.path}/tmp_result/"
                      f"{target_path}/{target}.rst", 'r') \
                    as reader, \
                    open(pathlib.Path(dest, target_path,
                                      f"{target}.rst"), 'w') as writer:
                logging.info(f"Post processing {target}")
                writer.write(f":original_name: {f.name}\n\n")
                # Add root file label
                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
                # post process some usual stuff
                for line in reader.readlines():
                    processed_line = re.sub(r'\.\.\\_', '.. _', line)
                    processed_line = re.sub(r'√', 'Y', processed_line)
                    processed_line = re.sub(
                        r'public_sys-resources/', '', processed_line)
                    processed_line = re.sub(
                        r' :name: .*$', '', processed_line)
                    processed_line = re.sub(
                        r'\*\*Parent topic:.*$', '', processed_line)
                    processed_line = re.sub(
                        r'.. code:: screen$',
                        r'.. code-block::', processed_line)
                    for lexer in ["json", "bash", "text", "console"]:
                        processed_line = re.sub(
                            f".. code:: {lexer}$",
                            f".. code-block:: {lexer}", processed_line)
                        if re.match(rf".. code:: {lexer}\s", processed_line):
                            logging.error(
                                f"'code-block: {lexer}' with something "
                                "afterwards")
                            exit(1)
                    # spaces are important, since a code-block may reside
                    # inside of a table cell
                    processed_line = re.sub(
                        r'.. code:: screen\s',
                        r'.. code-block:: ', processed_line)
                    processed_line = re.sub(
                        r'.. code:: codeblock$',
                        r'.. code-block::', processed_line)
                    processed_line = re.sub(r'[ \t]*$', '', processed_line)
                    writer.write(processed_line)
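
        # The loop below writes an index.rst per tree node, roughly like
        # (hypothetical entry names):
        #   .. _en-us_topic_0000:
        #
        #   ==========
        #   Some Title
        #   ==========
        #
        #   .. toctree::
        #      :maxdepth: 1
        #
        #      creating_an_ecs
        #      instances/index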

        # Generate indexes
        for k, v in tree.items():
            path = ''
            title = 'Main Index'
            page_label = ''
            if k != 0:
                curr = metadata_by_code[k]
                title = curr['title']
                page_label = curr['uri'].replace(".html", "").lower()
                path = self.get_target_path(curr['code'], metadata_by_code)
            with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
                if page_label:
                    index.write(f".. _{page_label}:\n\n")
                index.write('=' * (len(title)) + '\n')
                index.write(title + '\n')
                index.write('=' * (len(title)) + '\n')
                index.write('\n')
                index.write('.. toctree::\n')
                index.write('   :maxdepth: 1\n\n')
                for child in v:
                    new_name = child['new_name']
                    if child['code'] in tree:
                        # If this is a folder - add /index
                        new_name = new_name + '/index'
                    index.write(f"   {new_name}\n")

            p = pathlib.Path(dest, f"{path}.rst")
            if p.exists():
                logging.warning(
                    f"{p.resolve()} is removed in favour"
                    f" of result/{path}/index.rst")
                p.unlink()

        # Copy used images
        if len(self.doc_images) > 0:
            logging.debug("Processing images")
            img_dest = pathlib.Path(dest, '_static', 'images')
            img_dest.mkdir(parents=True, exist_ok=True)
            for img in self.doc_images:
                shutil.copyfile(
                    pathlib.Path(self.args.path, img).resolve(strict=False),
                    pathlib.Path(img_dest, img).resolve(strict=False)
                )


def main():
    OTCDocConvertor().main()


if __name__ == "__main__":
    main()
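
# Typical invocation (hypothetical paths; "path" must be the directory with
# the exported HTML files plus CLASS.TXT.json):
#   python3 this_script.py <path-to-export> --dest <output-dir> \
#       --improve-table-headers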