forked from docs/doc-exports

switch to bs4 for html processing

parent 566b2f7ad4
commit b9a16890bd

process.py (67 lines changed)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import argparse
+import bs4
 import json
 import os
 import pathlib
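
The new import pulls in BeautifulSoup (bs4 is the import name of the beautifulsoup4 package), and the hunks below construct it with the "lxml" backend, so lxml must be installed alongside beautifulsoup4. A minimal sketch of the parse entry point, not part of the commit:

    import bs4

    soup = bs4.BeautifulSoup("<html><body><p>hi</p></body></html>", "lxml")
    print(soup.body.p.string)  # -> hi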
@@ -49,6 +50,27 @@ def build_doc_tree(metadata):
         flat_tree[parent_id].append(v)
     return flat_tree
 
+def flatten_html(soup):
+    for i in soup.body.find_all('div'):
+        if "note" in i.get('class', []):
+            del i['id']
+            if i.img:
+                i.img.decompose()
+            notetitle = i.find('span', class_='notetitle')
+            if notetitle:
+                title = soup.new_tag('div')
+                title['class'] = 'title'
+                title.string = 'Note:'
+                notetitle.replace_with(title)
+            #if i.p:
+            #    i.p.unwrap()
+        else:
+            i.name = 'p'
+    for tbl in soup.body.find_all('table'):
+        tbl_id = tbl.get('id')
+        if tbl_id:
+            tbl['id'] = re.sub('[-_]', '', tbl_id)
+    return soup.body
 
 def main():
     parser = argparse.ArgumentParser(description='Process links.')
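
To illustrate what flatten_html does: note-styled divs keep their box but lose their id and icon and get a uniform "Note:" title, every other div is renamed to a plain p, and table ids are normalized by stripping '-' and '_'. A hedged, illustrative example (the HTML snippet and expected results are mine, not from the repository, and it assumes process.py keeps main() behind an __name__ guard so it can be imported):

    import bs4
    from process import flatten_html  # the function added above

    html = """
    <html><body>
      <div id="note_1" class="note">
        <img src="icon.png"/>
        <span class="notetitle">NOTE</span>
        <p>Check quotas first.</p>
      </div>
      <div class="section"><p>Body text.</p></div>
      <table id="en-us_table_01"></table>
    </body></html>
    """

    body = flatten_html(bs4.BeautifulSoup(html, "lxml"))
    # The note div now contains <div class="title">Note:</div> and no img/id,
    # the section div has become a <p>, and the table id is "enustable01".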
@@ -91,19 +113,46 @@ def main():
         # Pre-processing of html content
         with open(f, 'r') as reader, open(f"temp/{target}.tmp", 'w') as writer:
             print(f"Processing {target}")
-            for line in reader.readlines():
+            doc_anchors = dict()
+            content = reader.read()
+            soup = bs4.BeautifulSoup(content, "lxml")
+            proc = flatten_html(soup)
+            # Fix cross links
+            for lnk in proc.find_all("a"):
+                href = lnk.get('href')
+                if href:
+                    page_url = ''
+                    href_parts = href.split('#')
+                    if href_parts[0] in rename_matrix:
+                        page_url = ('../' * target_deepness) + \
+                            rename_matrix[href_parts[0]]
+                    else:
+                        page_url = href_parts[0]
+                    if len(href_parts) > 1:
+                        lnk['href'] = (
+                            f"{page_url}#" +
+                            re.sub('[-_]', '', href_parts[1])
+                        )
+                    else:
+                        lnk['href'] = lnk['href'].replace(
+                            href_parts[0],
+                            page_url)
+                if not href:
+                    lnk_name = lnk.get('name')
+                    if (
+                        lnk_name and not lnk.string
+                        and not lnk_name in doc_anchors
+                    ):
+                        lnk['id'] = lnk_name
+                        doc_anchors[lnk_name] = 1
+            #writer.write(str(proc))
+            for line in str(proc).splitlines():
                 table_match = table_re.match(line)
                 if table_match:
                     writer.write(f".. _{table_match.group(1)}:\n\n")
                 if not line.startswith("<strong>Parent topic:</strong>"):
-                    processed_line = line
-                    # Replace links to point to renamed files
-                    for k, v in rename_matrix.items():
-                        replace = ('../' * target_deepness) + v
-                        processed_line = processed_line.replace(k, replace)
-                    writer.write(processed_line)
+                    # Drop all divs
+                    processed_line = re.sub(r'<[/]?div[^>]*>', '', line)
+                    writer.write(processed_line + '\n')
 
 
         # Convert html to rst
         os.system(
             f"pandoc 'temp/{target}.tmp' -f html "
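
The same '[-_]' normalization is applied to table ids in flatten_html, to href fragments here, and (via the ids matched by table_re) to the `.. _anchor:` labels written out, which is what keeps cross references consistent end to end. A small sketch of the fragment rewrite, with an illustrative href of my own:

    import re

    href = "example_page.html#en-us_topic_0000001_table_01"
    page, frag = href.split('#')
    print(f"{page}#" + re.sub('[-_]', '', frag))
    # -> example_page.html#enustopic0000001table01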
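
The hunk ends mid-call, so the remaining pandoc arguments are not visible in this diff. Purely as a hedged sketch: the comment says the temp file is converted from HTML to RST, which with stock pandoc would look roughly like the following (the output format, file names, and use of subprocess instead of os.system are my assumptions, not taken from the commit):

    import subprocess

    # Illustrative paths; process.py derives them from `target`.
    subprocess.run(
        ["pandoc", "temp/example.tmp", "-f", "html", "-t", "rst",
         "-o", "result/example.rst"],
        check=True,
    )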