From f05664fc1c3816e54edb1641709a9e364d9b43d5 Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Sun, 1 Jul 2018 13:12:57 -0700 Subject: [PATCH] Refactor article and build code --- lexipython.py | 70 ++++++++++- src/article.py | 195 +++++++++++++++++++++++++++++++ src/build.py | 306 +++---------------------------------------------- 3 files changed, 280 insertions(+), 291 deletions(-) create mode 100644 src/article.py diff --git a/lexipython.py b/lexipython.py index 0f65315..c41551c 100644 --- a/lexipython.py +++ b/lexipython.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -# Check for the right Python version import sys if sys.version_info[0] < 3: raise Exception("Lexipython requires Python 3") @@ -9,6 +8,8 @@ import argparse import os import re import json +from src.article import LexiconArticle +import src.build as build import src.utils as utils def is_lexicon(name): @@ -143,12 +144,77 @@ def command_init(name): # Create an empty status file open(os.path.join(lex_path, "status"), "w").close() print("Created Lexicon {}".format(name)) + # Done initializing + return def command_build(name): """ Rebuilds the browsable pages of a Lexicon. """ - pass + # Load the Lexicon's peripherals + config = utils.load_config(name) + entry_skeleton = utils.load_resource("entry-page.html") + css = utils.load_resource("lexicon.css") + # Parse the written articles + articles = LexiconArticle.parse_from_directory(os.path.join("lexicon", name, "src")) + # At this point, the articles haven't been cross-populated, + # so we can derive the written titles from this list + written_titles = [article.title for article in articles] + articles = sorted( + LexiconArticle.populate(articles), + key=lambda a: utils.titlestrip(a.title)) + #phantom_titles = [article.title for article in articles if article.title not in written_titles] + lex_path = os.path.join("lexicon", name) + def pathto(*els): + return os.path.join(lex_path, *els) + + # Write the redirect page + print("Writing redirect page...") + with open(pathto("index.html"), "w", encoding="utf8") as f: + f.write(utils.load_resource("redirect.html").format(lexicon=config["LEXICON_TITLE"])) + + # Write the article pages + print("Deleting old article pages...") + for filename in os.listdir(pathto("article")): + if filename[-5:] == ".html": + os.remove(pathto("article", filename)) + print("Writing article pages...") + l = len(articles) + for idx in range(l): + article = articles[idx] + with open(pathto("article", article.title_filesafe + ".html"), "w", encoding="utf8") as f: + content = article.build_default_content() + citeblock = article.build_default_citeblock( + None if idx == 0 else articles[idx - 1].title, + None if idx == l-1 else articles[idx + 1].title) + article_html = entry_skeleton.format( + title = article.title, + lexicon = config["LEXICON_TITLE"], + css = css, + logo = config["LOGO_FILENAME"], + prompt = config["PROMPT"], + content = content, + citeblock = citeblock) + f.write(article_html) + print(" Wrote " + article.title) + + # Write default pages + print("Writing default pages...") + with open(pathto("contents", "index.html"), "w", encoding="utf8") as f: + f.write(build.build_contents_page(articles, config)) + print(" Wrote Contents") + with open(pathto("rules", "index.html"), "w", encoding="utf8") as f: + f.write(build.build_rules_page(config)) + print(" Wrote Rules") + with open(pathto("formatting", "index.html"), "w", encoding="utf8") as f: + f.write(build.build_formatting_page(config)) + print(" Wrote Formatting") + with open(pathto("session", 
"index.html"), "w", encoding="utf8") as f: + f.write(build.build_session_page(config)) + print(" Wrote Session") + with open(pathto("statistics", "index.html"), "w", encoding="utf8") as f: + f.write(build.build_statistics_page(articles, config)) + print(" Wrote Statistics") def command_run(name): """ diff --git a/src/article.py b/src/article.py new file mode 100644 index 0000000..c809054 --- /dev/null +++ b/src/article.py @@ -0,0 +1,195 @@ +import os +import sys +import re +import src.utils as utils + +class LexiconArticle: + """ + A Lexicon article and its metadata. + + Members: + author string: the author of the article + turn integer: the turn the article was written for + title string: the article title + title_filesafe string: the title, escaped, used for filenames + content string: the HTML content, with citations replaced by format hooks + citations dict mapping format hook string to tuple of link alias and link target title + wcites list: titles of written articles cited + pcites list: titles of phantom articles cited + citedby list: titles of articles that cite this + The last three are filled in by populate(). + """ + + def __init__(self, author, turn, title, content, citations): + """ + Creates a LexiconArticle object with the given parameters. + """ + self.author = author + self.turn = turn + self.title = title + self.title_filesafe = utils.titleescape(title) + self.content = content + self.citations = citations + self.wcites = set() + self.pcites = set() + self.citedby = set() + + @staticmethod + def from_file_raw(raw_content): + """ + Parses the contents of a Lexipython source file into a LexiconArticle + object. If the source file is malformed, returns None. + """ + headers = raw_content.split('\n', 3) + if len(headers) != 4: + print("Header read error") + return None + author_header, turn_header, title_header, content_raw = headers + # Validate and sanitize the author header + if not author_header.startswith("# Author:"): + print("Author header missing or corrupted") + return None + author = author_header[9:].strip() + # Validate and sanitize the turn header + if not turn_header.startswith("# Turn:"): + print("Turn header missing or corrupted") + return None + turn = None + try: + turn = int(turn_header[7:].strip()) + except: + print("Turn header error") + return None + # Validate and sanitize the title header + if not title_header.startswith("# Title:"): + print("Title header missing or corrupted") + return None + title = utils.titlecase(title_header[8:]) + # Parse the content and extract citations + paras = re.split("\n\n+", content_raw.strip()) + content = "" + citations = {} + format_id = 1 + if not paras: + print("No content") + for para in paras: + # Escape angle brackets + para = re.sub("<", "<", para) + para = re.sub(">", ">", para) + # Replace bold and italic marks with tags + para = re.sub(r"//([^/]+)//", r"\1", para) + para = re.sub(r"\*\*([^*]+)\*\*", r"\1", para) + # Replace \\LF with
+			para = re.sub(r"\\\\\n", "<br>\n", para)
+			# Abstract citations into the citation record
+			link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
+			while link_match:
+				# Identify the citation text and cited article
+				cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
+				cite_title = utils.titlecase(link_match.group(3))
+				# Record the citation
+				citations["c"+str(format_id)] = (cite_text, cite_title)
+				# Stitch the format id in place of the citation
+				para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):]
+				format_id += 1 # Increment to the next format citation
+				link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
+			# Convert signature to right-aligned
+			if para[:1] == '~':
+				para = "<p class=\"signature\">" + para[1:] + "</p>\n"
+			else:
+				para = "<p>" + para + "</p>\n"
+			content += para
+		return LexiconArticle(author, turn, title, content, citations)
+
+	@staticmethod
+	def parse_from_directory(directory):
+		"""
+		Reads and parses each source file in the given directory.
+		Input:  directory, the path to the folder to read
+		Output: a list of parsed articles
+		"""
+		articles = []
+		print("Reading source files from", directory)
+		for filename in os.listdir(directory):
+			path = os.path.join(directory, filename)
+			# Read only .txt files
+			if filename[-4:] == ".txt":
+				print(" Parsing", filename)
+				with open(path, "r", encoding="utf8") as src_file:
+					raw = src_file.read()
+					article = LexiconArticle.from_file_raw(raw)
+					if article is None:
+						print(" ERROR")
+					else:
+						print(" success:", article.title)
+						articles.append(article)
+		return articles
+
+	@staticmethod
+	def populate(lexicon_articles):
+		"""
+		Given a list of lexicon articles, fills out citation information
+		for each article and creates phantom pages for missing articles.
+		"""
+		article_by_title = {article.title : article for article in lexicon_articles}
+		# Determine all articles that exist or should exist
+		extant_titles = set([citation[1] for article in lexicon_articles for citation in article.citations])
+		# Interlink all citations
+		for article in lexicon_articles:
+			for cite_tuple in article.citations.values():
+				target = cite_tuple[1]
+				# Create article objects for phantom citations
+				if target not in article_by_title:
+					article_by_title[target] = LexiconArticle(None, sys.maxsize, target, "<p><i>This entry hasn't been written yet.</i></p>", {})
+				# Interlink citations
+				if article_by_title[target].author is None:
+					article.pcites.add(target)
+				else:
+					article.wcites.add(target)
+				article_by_title[target].citedby.add(article.title)
+		return list(article_by_title.values())
+
+	def build_default_content(self):
+		"""
+		Formats citations into the article content as normal HTML links
+		and returns the result.
+		"""
+		format_map = {
+			format_id: "<a href=\"{1}.html\"{2}>{0}</a>".format(
+				cite_tuple[0], utils.titleescape(cite_tuple[1]),
+				"" if cite_tuple[1] in self.wcites else " class=\"phantom\"")
+			for format_id, cite_tuple in self.citations.items()
+		}
+		return self.content.format(**format_map)
+
+	def build_default_citeblock(self, prev_target, next_target):
+		"""
+		Builds the citeblock content HTML for use in regular article pages.
+		For each defined target, links the target page as Previous or Next.
+		"""
+		citeblock = "<div class=\"citeblock\">\n"
+		# Prev/next links
+		if next_target is not None:
+			citeblock += "<p><a href=\"{}.html\">Next &#8594;</a></p>\n".format(utils.titleescape(next_target))
+		if prev_target is not None:
+			citeblock += "<p><a href=\"{}.html\">&#8592; Previous</a></p>\n".format(utils.titleescape(prev_target))
+		elif next_target is not None:
+			citeblock += "<p>&nbsp;</p>\n"
+		# Citations
+		cites_links = [
+			"<a href=\"{1}.html\"{2}>{0}</a>".format(
+				title, utils.titleescape(title),
+				"" if title in self.wcites else " class=\"phantom\"")
+			for title in sorted(self.wcites | self.pcites)]
+		cites_str = " | ".join(cites_links)
+		if len(cites_str) < 1: cites_str = "--"
+		citeblock += "<p>Citations: {}</p>\n".format(cites_str)
+		# Citedby
+		citedby_links = [
+			"<a href=\"{1}.html\">{0}</a>".format(
+				title, utils.titleescape(title))
+			for title in self.citedby]
+		citedby_str = " | ".join(citedby_links)
+		if len(citedby_str) < 1: citedby_str = "--"
+		citeblock += "<p>Cited by: {}</p>\n</div>
\n".format(citedby_str) + return citeblock diff --git a/src/build.py b/src/build.py index d81bd65..7c783d6 100644 --- a/src/build.py +++ b/src/build.py @@ -1,7 +1,3 @@ -############################### -## Lexipython Lexicon engine ## -############################### - import sys # For argv and stderr import os # For reading directories import re # For parsing lex content @@ -9,200 +5,7 @@ import io # For writing pages out as UTF-8 import networkx # For pagerank analytics from collections import defaultdict # For rank inversion in statistics -# Main article class - -class LexiconArticle: - """ - A Lexicon article and its metadata. - - Members: - author string: the author of the article - turn integer: the turn the article was written for - title string: the article title - title_filesafe string: the title, escaped, used for filenames - content string: the HTML content, with citations replaced by format hooks - citations dict from format hook string to tuple of link alias and link target title - wcites list: titles of written articles cited - pcites list: titles of phantom articles cited - citedby list: titles of articles that cite this - The last three are filled in by populate(). - """ - - def __init__(self, author, turn, title, content, citations): - """ - Creates a LexiconArticle object with the given parameters. - """ - self.author = author - self.turn = turn - self.title = title - self.title_filesafe = titleescape(title) - self.content = content - self.citations = citations - self.wcites = set() - self.pcites = set() - self.citedby = set() - - @staticmethod - def from_file_raw(raw_content): - """ - Parses the contents of a Lexipython source file into a LexiconArticle - object. If the source file is malformed, returns None. - """ - headers = raw_content.split('\n', 3) - if len(headers) != 4: - print("Header read error") - return None - author_header, turn_header, title_header, content_raw = headers - # Validate and sanitize the author header - if not author_header.startswith("# Author:"): - print("Author header missing") - return None - author = author_header[9:].strip() - # Validate and sanitize the turn header - if not turn_header.startswith("# Turn:"): - print("Turn header missing") - return None - turn = None - try: - turn = int(turn_header[7:].strip()) - except: - print("Turn header error") - return None - # Validate and sanitize the title header - if not title_header.startswith("# Title:"): - print("Title header missing") - return None - title = titlecase(title_header[8:]) - # Parse the content and extract citations - paras = re.split("\n\n+", content_raw.strip()) - content = "" - citations = {} - format_id = 1 - if not paras: - print("No content") - for para in paras: - # Escape angle brackets - para = re.sub("<", "<", para) - para = re.sub(">", ">", para) - # Replace bold and italic marks with tags - para = re.sub(r"//([^/]+)//", r"\1", para) - para = re.sub(r"\*\*([^*]+)\*\*", r"\1", para) - # Replace \\LF with
-			para = re.sub(r"\\\\\n", "<br>\n", para)
-			# Abstract citations into the citation record
-			link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
-			while link_match:
-				# Identify the citation text and cited article
-				cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
-				cite_title = titlecase(link_match.group(3))
-				# Record the citation
-				citations["c"+str(format_id)] = (cite_text, cite_title)
-				# Stitch the format id in place of the citation
-				para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):]
-				format_id += 1 # Increment to the next format citation
-				link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
-			# Convert signature to right-aligned
-			if para[:1] == '~':
-				para = "<p class=\"signature\">" + para[1:] + "</p>\n"
-			else:
-				para = "<p>" + para + "</p>\n"
-			content += para
-		return LexiconArticle(author, turn, title, content, citations)
-
-	def build_page_content(self):
-		"""
-		Formats citations into the article content as normal HTML links
-		and returns the result.
-		"""
-		format_map = {
-			format_id: "<a href=\"{1}.html\"{2}>{0}</a>".format(
-				cite_tuple[0], titleescape(cite_tuple[1]),
-				"" if cite_tuple[1] in self.wcites else " class=\"phantom\"")
-			for format_id, cite_tuple in self.citations.items()
-		}
-		return self.content.format(**format_map)
-
-	def build_page_citeblock(self, prev_target, next_target):
-		"""
-		Builds the citeblock content HTML for use in regular article pages.
-		For each defined target, links the target page as Previous or Next.
-		"""
-		citeblock = "<div class=\"citeblock\">\n"
-		# Prev/next links
-		if next_target is not None:
-			citeblock += "<p><a href=\"{}.html\">Next &#8594;</a></p>\n".format(titleescape(next_target))
-		if prev_target is not None:
-			citeblock += "<p><a href=\"{}.html\">&#8592; Previous</a></p>\n".format(titleescape(prev_target))
-		elif next_target is not None:
-			citeblock += "<p>&nbsp;</p>\n"
-		# Citations
-		cites_links = [
-			"<a href=\"{1}.html\"{2}>{0}</a>".format(
-				title, titleescape(title),
-				"" if title in self.wcites else " class=\"phantom\"")
-			for title in sorted(self.wcites | self.pcites)]
-		cites_str = " | ".join(cites_links)
-		if len(cites_str) < 1: cites_str = "--"
-		citeblock += "<p>Citations: {}</p>\n".format(cites_str)
-		# Citedby
-		citedby_links = [
-			"<a href=\"{1}.html\">{0}</a>".format(
-				title, titleescape(title))
-			for title in self.citedby]
-		citedby_str = " | ".join(citedby_links)
-		if len(citedby_str) < 1: citedby_str = "--"
-		citeblock += "<p>Cited by: {}</p>\n</div>
\n".format(citedby_str) - return citeblock - -# Parsing functions for source intake - -def parse_from_directory(directory): - """ - Reads and parses each source file in the given directory. - Input: directory, the path to the folder to read - Output: a list of parsed articles - """ - articles = [] - print("Reading source files from", directory) - for filename in os.listdir(directory): - path = directory + filename - # Read only .txt files - if filename[-4:] == ".txt": - print(" Parsing", filename) - with open(path, "r", encoding="utf8") as src_file: - raw = src_file.read() - article = LexiconArticle.from_file_raw(raw) - if article is None: - print(" ERROR") - else: - print(" success:", article.title) - articles.append(article) - return articles - -def populate(lexicon_articles): - """ - Given a list of lexicon articles, fills out citation information - for each article and creates phantom pages for missing articles. - """ - article_by_title = {article.title : article for article in lexicon_articles} - # Determine all articles that exist or should exist - extant_titles = set([citation[1] for article in lexicon_articles for citation in article.citations]) - # Interlink all citations - for article in lexicon_articles: - for cite_tuple in article.citations.values(): - target = cite_tuple[1] - # Create article objects for phantom citations - if target not in article_by_title: - article_by_title[target] = LexiconArticle(None, sys.maxsize, target, "
<p><i>This entry hasn't been written yet.</i></p>
", {}) - # Interlink citations - if article_by_title[target].author is None: - article.pcites.add(target) - else: - article.wcites.add(target) - article_by_title[target].citedby.add(article.title) - return list(article_by_title.values()) - -# Build functions +import src.utils as utils def build_contents_page(articles, config): """ @@ -222,15 +25,15 @@ def build_contents_page(articles, config): "" if article.author is not None else " class=\"phantom\"") for article in articles} # Write the articles in alphabetical order - content += load_resource("contents.html") + content += utils.load_resource("contents.html") content += "
\n
\n" # Fill in the entry skeleton - entry_skeleton = load_resource("entry-page.html") - css = load_resource("lexicon.css") + entry_skeleton = utils.load_resource("entry-page.html") + css = utils.load_resource("lexicon.css") return entry_skeleton.format( title="Statistics", lexicon=config["LEXICON_TITLE"], @@ -435,82 +238,7 @@ def build_graphviz_file(cite_map): # Summative functions -def command_build(argv): - if len(argv) >= 3 and (argv[2] != "partial" and argv[2] != "full"): - print("unknown build type: " + argv[2]) - return - # Load content - config = load_config() - entry_skeleton = load_resource("entry-page.html") - css = load_resource("lexicon.css") - articles = [article for article in parse_from_directory("raw/") if article is not None] - written_titles = [article.title for article in articles] - articles = sorted(populate(articles), key=lambda a: (a.turn, a.title)) - #print(articles[13].title_filesafe) - #return - phantom_titles = [article.title for article in articles if article.title not in written_titles] - - # Write the redirect page - print("Writing redirect page...") - with open("out/index.html", "w", encoding="utf8") as f: - f.write(load_resource("redirect.html").format(lexicon=config["LEXICON_TITLE"])) - - # Write the article pages - print("Deleting old article pages...") - for filename in os.listdir("out/article/"): - if filename[-5:] == ".html": - os.remove("out/article/" + filename) - print("Writing article pages...") - l = len(articles) - for idx in range(l): - article = articles[idx] - with open("out/article/" + article.title_filesafe + ".html", "w", encoding="utf8") as f: - content = article.build_page_content() - citeblock = article.build_page_citeblock( - None if idx == 0 else articles[idx - 1].title, - None if idx == l-1 else articles[idx + 1].title) - article_html = entry_skeleton.format( - title = article.title, - lexicon = config["LEXICON_TITLE"], - css = css, - logo = config["LOGO_FILENAME"], - prompt = config["PROMPT"], - content = content, - citeblock = citeblock) - f.write(article_html) - print(" Wrote " + article.title) - - # Write default pages - print("Writing default pages...") - with open("out/contents/index.html", "w", encoding="utf8") as f: - f.write(build_contents_page(articles, config)) - print(" Wrote Contents") - with open("out/rules/index.html", "w", encoding="utf8") as f: - f.write(build_rules_page(config)) - print(" Wrote Rules") - with open("out/formatting/index.html", "w", encoding="utf8") as f: - f.write(build_formatting_page(config)) - print(" Wrote Formatting") - with open("out/session/index.html", "w", encoding="utf8") as f: - f.write(build_session_page(config)) - print(" Wrote Session") - with open("out/statistics/index.html", "w", encoding="utf8") as f: - f.write(build_statistics_page(articles, config)) - print(" Wrote Statistics") # Write auxiliary files # TODO: write graphviz file # TODO: write compiled lexicon page - -def main(): - if len(sys.argv) < 2: - print("Available commands:") - print(" - build [partial] : Build the lexicon and generate phantom stubs for all unwritten articles.") - print(" - build full : Build the lexicon and generate Ersatz pages for all unwritten articles.") - elif sys.argv[1] == "build": - command_build(sys.argv) - else: - print("Unknown command: " + sys.argv[1]) - -if __name__ == "__main__": - main()
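
A quick way to exercise the refactored module is the smoke-test sketch below. It assumes Python 3 and that it is run from the repository root (so src.article and src.utils import the same way lexipython.py imports them); the author "Alice" and the article text are invented for illustration and are not part of the patch or of any real Lexicon.

    # Hypothetical smoke test for the refactored LexiconArticle flow.
    # Run from the repository root so the src package is importable.
    from src.article import LexiconArticle

    # Sample source text in the format from_file_raw() expects:
    # three "# Header:" lines, then paragraphs with [[citations]].
    raw = (
        "# Author: Alice\n"
        "# Turn: 1\n"
        "# Title: The Cities of Brass\n"
        "The cities were founded by [[the First Dynasty]] and later\n"
        "chronicled in [[that famous text|The Atlas of Sand]].\n"
        "\n"
        "~Alice, Royal Cartographer"
    )

    article = LexiconArticle.from_file_raw(raw)
    assert article is not None
    # Citations are abstracted into format hooks ({c1}, {c2}) and recorded
    # as (link alias, cited title) pairs.
    print(article.citations)

    # populate() creates phantom stubs for cited-but-unwritten titles and
    # fills in wcites/pcites/citedby on every article.
    all_articles = LexiconArticle.populate([article])
    print(sorted(a.title for a in all_articles))
    print(article.pcites)  # both cited titles are phantoms in this example

    # At page-build time the hooks are replaced with real <a> links.
    print(article.build_default_content())

command_build() then drops build_default_content() and build_default_citeblock() output into the entry-page.html skeleton and writes one page per written or phantom article under lexicon/<name>/article/.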