register images outside figures (#10)

register images outside figures

Reviewed-by: OpenTelekomCloud Bot <None>
This commit is contained in:
Artem Goncharov 2022-05-09 17:07:57 +02:00 committed by GitHub
parent 81c5e8ac5c
commit bc99fce71a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -143,6 +143,12 @@ class OTCDocConvertor:
else: else:
i.name = 'p' i.name = 'p'
# Process remaining images
for img in soup.body.find_all('img'):
if img['src'] and not img['src'].startswith('/_static/images'):
self.doc_images.add(img['src'])
img['src'] = '/_static/images/' + img['src']
# Drop strong in table headers "/" # Drop strong in table headers "/"
for th in soup.body.find_all('th'): for th in soup.body.find_all('th'):
if th.p.strong: if th.p.strong:
@ -203,6 +209,8 @@ class OTCDocConvertor:
rawize_strings = [ rawize_strings = [
# "\*\*\*\*\*\*", # "\*\*\*\*\*\*",
# r"([\\\/\:\*\?\"\~|<>]{4,})" # r"([\\\/\:\*\?\"\~|<>]{4,})"
# ModelArts UMN contains this "unallowed" sequence
r"(\\/:\*\?\"<>\|)"
] ]
for to_rawize in rawize_strings: for to_rawize in rawize_strings:
for p in soup.body.find_all(string=re.compile(to_rawize)): for p in soup.body.find_all(string=re.compile(to_rawize)):
@ -217,6 +225,8 @@ class OTCDocConvertor:
p.replace_with(bs4.BeautifulSoup(new, 'html.parser')) p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
print(part.group(1)) print(part.group(1))
print(f"New content is {p.string}") print(f"New content is {p.string}")
else:
print('ups')
logging.error(f"String with star: {p}") logging.error(f"String with star: {p}")
return soup.body return soup.body
@ -298,9 +308,8 @@ class OTCDocConvertor:
open(pathlib.Path(self.args.path, open(pathlib.Path(self.args.path,
f"temp/{target}.tmp"), 'w') as writer: f"temp/{target}.tmp"), 'w') as writer:
# if f.name not in [ # if f.name not in [
# "modelarts_21_0031.html", # ]:
# "en-us_topic_0032380449.html"]: # continue
# continue
logging.info(f"Pre-Processing {f} as {target}") logging.info(f"Pre-Processing {f} as {target}")
content = reader.read() content = reader.read()
soup = bs4.BeautifulSoup(content, "lxml") soup = bs4.BeautifulSoup(content, "lxml")
@ -356,9 +365,6 @@ class OTCDocConvertor:
processed_line = re.sub(r'', 'Y', processed_line) processed_line = re.sub(r'', 'Y', processed_line)
processed_line = re.sub( processed_line = re.sub(
r'public_sys-resources/', '', processed_line) r'public_sys-resources/', '', processed_line)
processed_line = re.sub(
r'image:: ', 'image:: /_static/images/',
processed_line)
processed_line = re.sub( processed_line = re.sub(
r' :name: .*$', '', processed_line) r' :name: .*$', '', processed_line)
processed_line = re.sub( processed_line = re.sub(