165 lines
5.8 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
import argparse
|
|
from datetime import datetime, timezone
|
|
import copy
|
|
import os
|
|
import pathlib
|
|
import shutil
|
|
|
|
import bs4
|
|
from feedgen.feed import FeedGenerator
|
|
import markdown
|
|
|
|
|
|
def main():
    """Build the static site.

    Converts every ``src/**/*.md`` file to HTML through the shared
    ``src/.template.html`` template, copies every other (non-dot) file
    verbatim into the output directory given on the command line, and
    writes one RSS ``feed.xml`` per feed named in page frontmatter.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("out", help="output directory")
    args = parser.parse_args()

    src = pathlib.Path("src")
    out = pathlib.Path(args.out)

    md = markdown.Markdown(extensions=[
        # Set HTML attributes with {#id}
        "attr_list",
        # Footnotes [^1]
        "footnotes",
        # Parse markdown within HTML[markdown] blocks
        "md_in_html",
        # "YAML" frontmatter metadata
        "meta",
    ])
    # Separate converter for frontmatter "comment" snippets so the main
    # converter's per-document state (footnotes, Meta) stays isolated.
    comment_md = markdown.Markdown()

    # Map of feed name -> list of item dicts accumulated while walking.
    feeds = {}
    build_date = datetime.now(timezone.utc)

    # Load the template once; each page works on a copy of it.
    template = bs4.BeautifulSoup(
        (src / ".template.html").read_text(encoding="utf8"),
        features="html.parser")

    count = 0
    for dirpath, _, filenames in os.walk(src):
        dirpath = pathlib.Path(dirpath).relative_to(src)
        for filename in filenames:
            if filename[0] == ".":
                continue  # Skip dotfiles (including the template itself)
            count += 1

            path = src / dirpath / filename
            dest = out / dirpath / filename
            os.makedirs(dest.parent, exist_ok=True)

            # Copy any file types without content processing.
            # BUG FIX: test the real suffix rather than
            # filename.rsplit(".")[-1], which mis-classified an
            # extensionless file literally named "md" as markdown and
            # then crashed on unbound `content`/`meta` below.  This also
            # makes the old `dest.name.endswith(".md")` re-check redundant.
            if path.suffix != ".md":
                print("Copying ", path)
                shutil.copyfile(path, dest)
                continue

            # Preprocess markdown into html
            print("Converting", path)
            md.reset()  # Clear per-document state (footnotes, Meta)
            dest = dest.with_suffix(".html")
            content = md.convert(path.read_text(encoding="utf8"))
            # The "meta" extension exposes frontmatter as {key: [values]}.
            meta = md.Meta or {}

            # Inject content into the template
            page_content = bs4.BeautifulSoup(content, features="html.parser")
            page = copy.copy(template)
            article = page.new_tag("article")
            article.append(page_content)
            page.article.replace_with(article)

            # Rewrite links with markdown extensions to their .html form
            for a in page.css.select("a[href]"):
                if a["href"].endswith(".md"):
                    a["href"] = a["href"][:-3] + ".html"

            # Inject the directory path into the nav as breadcrumb links,
            # each followed by a "/" separator.
            for i in range(len(dirpath.parts)):
                a = page.new_tag("a")
                a["href"] = "/" + "/".join(dirpath.parts[:i+1]) + "/"
                a.string = dirpath.parts[i]
                page.nav.append(a)
                page.nav.append(page.new_string("/"))

            # Apply frontmatter metadata to the template.
            if meta_title := meta.get("title"):
                page.title.string = meta_title[0]
                page.header.h1.string = meta_title[0]

            if meta_date := meta.get("date"):
                p = page.new_tag("p")
                p["class"] = "metadata"
                p.string = "Date: " + meta_date[0]
                page.header.append(p)

            if meta_author := meta.get("author"):
                p = page.new_tag("p")
                p["class"] = "metadata"
                p.string = "Author: " + meta_author[0]
                page.header.append(p)

            if meta_source := meta.get("source"):
                for source_url in meta_source:
                    a = page.new_tag("a")
                    a["href"] = source_url
                    a.string = source_url
                    p = page.new_tag("p")
                    p["class"] = "metadata"
                    p.string = "URL: "
                    p.append(a)
                    page.header.append(p)

            if meta_comment := meta.get("comment"):
                for comment in meta_comment:
                    aside = page.new_tag("aside")
                    html = bs4.BeautifulSoup(
                        comment_md.convert(comment),
                        features="html.parser")
                    aside.extend(html.p.contents)
                    page.header.append(aside)

            # RSS metadata
            if "feed" in meta and "pubdate" in meta:
                pubdate = datetime.fromisoformat(meta["pubdate"][0])
                link = f"https://www.alogoulogoi.com/{dest.relative_to(out).as_posix()}"
                for feed in meta["feed"]:
                    # BUG FIX: read the title from `meta` instead of the
                    # walrus-bound `meta_title`, which was unbound (or
                    # stale from the previous page) when a post declared
                    # feed/pubdate but no title.
                    feeds.setdefault(feed, []).append({
                        "title": meta.get("title", [""])[0],
                        "link": link,
                        "description": "",
                        "pubdate": pubdate,
                    })

            # Write the fully templated page
            print("Writing ", dest)
            # BUG FIX: write UTF-8 explicitly — the platform default
            # encoding is not guaranteed to round-trip content that was
            # read as UTF-8 above.
            dest.write_text(str(page), encoding="utf8")

    # Emit one RSS document per feed, items in pubdate order.
    for feed, items in feeds.items():
        fg = FeedGenerator()
        fg.title(f"alogoulogoi /{feed}/")
        fg.link(href=f"https://www.alogoulogoi.com/{feed}/feed.xml")
        fg.description("Blog posts from alogoulogoi")
        fg.language("en-us")
        fg.lastBuildDate(build_date)
        for item in sorted(items, key=lambda i: i["pubdate"]):
            entry = fg.add_entry()
            entry.title(item["title"])
            entry.link(href=item["link"])
            entry.description(item["description"])
            entry.published(item["pubdate"])
        rss_path = (out / feed / "feed.xml")
        os.makedirs(rss_path.parent, exist_ok=True)
        rss_path.write_bytes(fg.rss_str(pretty=True))

    print("Processed", count, "files")
|
|
|
|
|
|
# Run the build only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|