forked from docs/doc-exports

switch to bs4 for html processing

parent 566b2f7ad4
commit b9a16890bd

process.py (67 lines changed)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import argparse
+import bs4
 import json
 import os
 import pathlib
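
The new import pulls in BeautifulSoup (bs4 is the import name of the beautifulsoup4 package), and the hunks below construct it with the "lxml" backend, so lxml must be installed alongside beautifulsoup4. A minimal sketch of the parse entry point, not part of the commit:

    import bs4

    soup = bs4.BeautifulSoup("<html><body><p>hi</p></body></html>", "lxml")
    print(soup.body.p.string)  # -> hi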
@@ -49,6 +50,27 @@ def build_doc_tree(metadata):
         flat_tree[parent_id].append(v)
     return flat_tree
 
+def flatten_html(soup):
+    for i in soup.body.find_all('div'):
+        if "note" in i.get('class', []):
+            del i['id']
+            if i.img:
+                i.img.decompose()
+            notetitle = i.find('span', class_='notetitle')
+            if notetitle:
+                title = soup.new_tag('div')
+                title['class'] = 'title'
+                title.string = 'Note:'
+                notetitle.replace_with(title)
+            #if i.p:
+            #    i.p.unwrap()
+        else:
+            i.name = 'p'
+    for tbl in soup.body.find_all('table'):
+        tbl_id = tbl.get('id')
+        if tbl_id:
+            tbl['id'] = re.sub('[-_]', '', tbl_id)
+    return soup.body
 
 def main():
     parser = argparse.ArgumentParser(description='Process links.')
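
To illustrate what flatten_html does: note-styled divs keep their box but lose their id and icon and get a uniform "Note:" title, every other div is renamed to a plain p, and table ids are normalized by stripping '-' and '_'. A hedged, illustrative example (the HTML snippet and expected results are mine, not from the repository, and it assumes process.py keeps main() behind an __name__ guard so it can be imported):

    import bs4
    from process import flatten_html  # the function added above

    html = """
    <html><body>
      <div id="note_1" class="note">
        <img src="icon.png"/>
        <span class="notetitle">NOTE</span>
        <p>Check quotas first.</p>
      </div>
      <div class="section"><p>Body text.</p></div>
      <table id="en-us_table_01"></table>
    </body></html>
    """

    body = flatten_html(bs4.BeautifulSoup(html, "lxml"))
    # The note div now contains <div class="title">Note:</div> and no img/id,
    # the section div has become a <p>, and the table id is "enustable01".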
@@ -91,19 +113,46 @@ def main():
         # Pre-processing of html content
         with open(f, 'r') as reader, open(f"temp/{target}.tmp", 'w') as writer:
             print(f"Processing {target}")
-            for line in reader.readlines():
+            doc_anchors = dict()
+            content = reader.read()
+            soup = bs4.BeautifulSoup(content, "lxml")
+            proc = flatten_html(soup)
+            # Fix cross links
+            for lnk in proc.find_all("a"):
+                href = lnk.get('href')
+                if href:
+                    page_url = ''
+                    href_parts = href.split('#')
+                    if href_parts[0] in rename_matrix:
+                        page_url = ('../' * target_deepness) + \
+                            rename_matrix[href_parts[0]]
+                    else:
+                        page_url = href_parts[0]
+                    if len(href_parts) > 1:
+                        lnk['href'] = (
+                            f"{page_url}#" +
+                            re.sub('[-_]', '', href_parts[1])
+                        )
+                    else:
+                        lnk['href'] = lnk['href'].replace(
+                            href_parts[0],
+                            page_url)
+                if not href:
+                    lnk_name = lnk.get('name')
+                    if (
+                        lnk_name and not lnk.string
+                        and not lnk_name in doc_anchors
+                    ):
+                        lnk['id'] = lnk_name
+                        doc_anchors[lnk_name] = 1
+            #writer.write(str(proc))
+            for line in str(proc).splitlines():
                 table_match = table_re.match(line)
                 if table_match:
                     writer.write(f".. _{table_match.group(1)}:\n\n")
                 if not line.startswith("<strong>Parent topic:</strong>"):
-                    processed_line = line
-                    # Replace links to point to renamed files
-                    for k, v in rename_matrix.items():
-                        replace = ('../' * target_deepness) + v
-                        processed_line = processed_line.replace(k, replace)
-                    writer.write(processed_line)
+                    # Drop all divs
+                    processed_line = re.sub(r'<[/]?div[^>]*>', '', line)
+                    writer.write(processed_line + '\n')
 
 
         # Convert html to rst
         os.system(
             f"pandoc 'temp/{target}.tmp' -f html "
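
The same '[-_]' normalization is applied to table ids in flatten_html, to href fragments here, and (via the ids matched by table_re) to the `.. _anchor:` labels written out, which is what keeps cross references consistent end to end. A small sketch of the fragment rewrite, with an illustrative href of my own:

    import re

    href = "example_page.html#en-us_topic_0000001_table_01"
    page, frag = href.split('#')
    print(f"{page}#" + re.sub('[-_]', '', frag))
    # -> example_page.html#enustopic0000001table01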
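
The hunk ends mid-call, so the remaining pandoc arguments are not visible in this diff. Purely as a hedged sketch: the comment says the temp file is converted from HTML to RST, which with stock pandoc would look roughly like the following (the output format, file names, and use of subprocess instead of os.system are my assumptions, not taken from the commit):

    import subprocess

    # Illustrative paths; process.py derives them from `target`.
    subprocess.run(
        ["pandoc", "temp/example.tmp", "-f", "html", "-t", "rst",
         "-o", "result/example.rst"],
        check=True,
    )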