# pip install "mistune<2" beautifulsoup4  (the Renderer API used below is mistune 1.x)
import argparse
from base64 import b64encode
from contextlib import suppress
from glob import iglob
from mimetypes import guess_type
from os import makedirs
from os.path import abspath, basename, dirname, join as joinpath, relpath
import string
from shutil import make_archive, rmtree
import subprocess

import mistune
from bs4 import BeautifulSoup

parser = argparse.ArgumentParser()
parser.add_argument("--force", "-f", action="store_true",
                    help="Ignore cached values and regenerate all material.")
FORCE = parser.parse_args().force

# This file resides in the _render directory. Content is in the parent.
_self = dirname(abspath(__file__))  # Used only in this block.
CONTENT_DIR = dirname(_self)
OUTPUT_DIR = joinpath(_self, "output")
with open(joinpath(_self, "template.html")) as f:
    TEMPLATE = string.Template(f.read())  # Grab header and footer HTML.

render_markdown = mistune.Markdown(renderer=mistune.Renderer())

for path in iglob(joinpath(CONTENT_DIR, "**/*.md"), recursive=True):
    if (
        # Don't render markdown that is in the output (there shouldn't be any).
        path.startswith(OUTPUT_DIR)
        # Don't render markdown within the venv folder.
        or relpath(path, start=dirname(abspath(__file__))).startswith("venv")
    ):
        continue

    filedir = dirname(path)
    noextname = basename(path)[:-3]  # Filename with the ".md" extension stripped.

    with open(path) as f:
        html = render_markdown(f.read())

    # Replace all images with base64 data URIs.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("img"):
        with suppress(KeyError, FileNotFoundError):
            src = joinpath(filedir, tag["src"])
            # Read the image, construct a b64 data URI and set it as the src attribute.
            with open(src, "rb") as f:
                bites = b64encode(f.read()).decode()  # decode() defaults to utf-8.
            tag["src"] = f"data:{guess_type(src)[0]};base64,{bites}"
    html = str(soup)  # Serialise the updated soup back to an HTML string.

    # The cached HTML file determines whether we need to rerender.
    # Must be done after inlining the images, as they too may have changed.
    with open(joinpath(filedir, f"{noextname}.html"), "a+") as cache:
        # Append-with-read ("a+") sets the pointer at the end, but creates the
        # file if necessary. Resetting the pointer is less effort than
        # checking for existence first.
        cache.seek(0)
        if cache.read() == html and not FORCE:
            continue  # Cache is up to date; skip rerendering.
        # Else replace the cache and proceed with rerendering.
        print("Rerendering", relpath(path, start=CONTENT_DIR))
        cache.seek(0)
        cache.truncate()  # Truncate to tell(), i.e. clear the file.
        cache.write(html)

    # Target directory to build outputs for this markdown.
    target = joinpath(OUTPUT_DIR, relpath(filedir, start=CONTENT_DIR))

    # Delete the old directory if it exists, clearing old content.
    # WARNING: Assumes the directory does not contain more than one markdown file.
    with suppress(FileNotFoundError):
        assert "output" in target, f"Bad deletion target {target}"  # Safety.
        rmtree(target)
    # Recreate the directory and parents as necessary in preparation for output.
    makedirs(target)

    # Write HTML files.
    with open(joinpath(target, f"{noextname}.html"), "w") as f:
        f.write(html)
    with open(joinpath(target, f"{noextname}.full.html"), "w") as f:
        f.write(TEMPLATE.substitute(html=html))

    # Create PDF. At some point we should switch to Selenium/Puppeteer.
    subprocess.run(
        [
            "D:/Libraries/Software/cli_tools/wkhtmltox/bin/wkhtmltopdf.exe",
            "-q",
            joinpath(target, f"{noextname}.full.html"),
            joinpath(target, f"{noextname}.pdf"),
        ]
    )

print("Zipping...")
make_archive(joinpath(CONTENT_DIR, "rendered"), "zip", OUTPUT_DIR)
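

# The comment above notes that the wkhtmltopdf call should eventually be
# replaced with Selenium/Puppeteer. Below is a minimal, hedged sketch of what
# a Selenium + headless Chrome version could look like. It is not wired into
# the script: the function name is made up, it needs `pip install selenium`
# (4.x) and a local Chrome install, and the Page.printToPDF options are
# illustrative assumptions rather than part of this project.
def _html_to_pdf_via_chrome(full_html_path, pdf_path):
    from base64 import b64decode
    from pathlib import Path

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        # Load the self-contained HTML file from disk.
        driver.get(Path(full_html_path).resolve().as_uri())
        # Chrome DevTools Protocol call; returns {"data": <base64-encoded PDF>}.
        result = driver.execute_cdp_cmd(
            "Page.printToPDF", {"printBackground": True}
        )
        Path(pdf_path).write_bytes(b64decode(result["data"]))
    finally:
        driver.quit()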