###############################
## Lexipython Lexicon engine ##
###############################

import sys  # For argv and stderr
import os  # For reading directories
import re  # For parsing lex content
# networkx is only needed for the pagerank analytics on the statistics
# page; make it optional so the rest of the tool runs without it.
try:
    import networkx
except ImportError:
    networkx = None
from collections import defaultdict  # For rank inversion in statistics

# Utility functions for handling titles and filenames

def titlecase(s):
    """Returns the string stripped, with only its first letter capitalized."""
    s = s.strip()
    return s[:1].capitalize() + s[1:]

def as_filename(s):
    """Makes a string filename-safe."""
    # Strip out <, >, :, ", ', /, \, |, ?, and *
    s = re.sub(r"[<>:\"'/\\|?*]", '', s)
    # Replace runs of non-ASCII characters with -
    s = re.sub(r"[^\x00-\x7F]+", '-', s)
    # Replace runs of whitespace with _
    s = re.sub(r"\s+", '_', s)
    return s

def titlestrip(s):
    """Strips certain leading articles for title sorting."""
    if s.startswith("The "):
        return s[4:]
    if s.startswith("An "):
        return s[3:]
    if s.startswith("A "):
        return s[2:]
    return s

def cmp_title(x, y):
    """
    Compares strings in titular order, ignoring prefixed articles.
    Returns a negative, zero, or positive integer, mirroring the removed
    Python 2 cmp() builtin's contract.
    """
    a, b = titlestrip(x), titlestrip(y)
    return (a > b) - (a < b)

def link_formatter(written_articles):
    """
    Creates a lambda that formats citation links and handles the phantom class.
    Input:  written_articles, a list of article titles to format as live links
    Output: a lambda (fid, alias, title) -> link_string
    """
    # NOTE(review): the anchor markup was reconstructed from the {0}/{1}/{2}
    # format arguments — verify against the original entry templates.
    return lambda fid, alias, title: "<a href=\"{1}.html\"{2}>{0}</a>".format(
        alias, as_filename(title),
        "" if title in written_articles else " class=\"phantom\"")

# Parsing functions for source intake

def parse_lex_header(header_para):
    """
    Parses the header paragraph of a lex file.
    Input:  header_para, raw header paragraph from the lex file
    Output: {"error": <message>} if there was an error, otherwise
            {"title": <article title>, "filename": <article filename>}
    """
    # The title, which is also translated to the filename, heads the
    # article after the #
    title_match = re.match("#(.+)", header_para)
    if not title_match:
        return {"error": "No match for title"}
    title = titlecase(title_match.group(1).strip())
    if not title:
        return {"error": "Could not parse header as title"}
    return {"title": title, "filename": as_filename(title)}
def parse_lex_content(paras):
    """
    Parses the content paragraphs of a lex file into HTML.
    Input:  paras, a list of raw paragraphs from the lex file
    Output: {"error": <message>} if there was an error, otherwise
            {"content": <article HTML>,
             "citations": {<format id>: (link text, link target)}}
    """
    parsed = {"content": "", "citations": {}}
    format_id = 1  # Each citation will be ID'd by {c#} for formatting later
    # Citation syntax is [[target]] or [[alias|target]]; compile once
    # instead of re-specifying the pattern on every search.
    cite_pattern = re.compile(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]")
    for para in paras:
        # Escape angle brackets so raw text cannot inject markup
        para = re.sub("<", "&lt;", para)
        para = re.sub(">", "&gt;", para)
        # Replace bold and italic marks with tags
        para = re.sub(r"\*\*([^*]+)\*\*", r"<b>\1</b>", para)
        para = re.sub(r"\/\/([^\/]+)\/\/", r"<i>\1</i>", para)
        # Replace a line ending in \\ with an explicit line break
        para = re.sub(r"\\\\\n", "<br>\n", para)
        # Abstract citations into the citation record
        link_match = cite_pattern.search(para)
        while link_match:
            # Identify the citation text and cited article
            cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
            cite_title = titlecase(link_match.group(3).strip())
            # Record the citation
            parsed["citations"]["c" + str(format_id)] = (cite_text, cite_title)
            # Stitch the format id in place of the citation
            para = para[:link_match.start(0)] + "{c" + str(format_id) + "}" + para[link_match.end(0):]
            format_id += 1  # Increment to the next format citation
            link_match = cite_pattern.search(para)
        # Convert a paragraph led by ~ into a signature paragraph
        # NOTE(review): the paragraph/signature markup was reconstructed —
        # verify against the original entry templates and stylesheet.
        if para[:1] == '~':
            para = "<hr><p class=\"signature\">" + para[1:] + "</p>\n"
        else:
            para = "<p>" + para + "</p>\n"
        parsed["content"] += para
    if not parsed["content"]:
        return {"error": "No content parsed"}
    return parsed
def parse_lex(lex_contents):
    """
    Parses the contents of a lex file into HTML and abstracts citations.
    Input:  lex_contents, the read contents of a lex file
    Output: {"error": <message>} on a parse failure, otherwise
            {"title": <article title>, "filename": <article filename>,
             "content": <article HTML>,
             "citations": {<format id>: (link text, link target)}}
    """
    parsed_article = {}
    # Split the file into paragraphs
    paras = re.split("\n\n+", lex_contents)
    # Parse the title from the header
    title_parsed = parse_lex_header(paras.pop(0))
    if "error" in title_parsed:
        return title_parsed
    parsed_article.update(title_parsed)
    # Parse each content paragraph
    content_parsed = parse_lex_content(paras)
    if "error" in content_parsed:
        return content_parsed
    parsed_article.update(content_parsed)
    # Return the fully abstracted article
    return parsed_article

def parse_lex_from_directory(directory):
    """
    Reads and parses each .lex file in the given directory.
    Input:  directory, the path to the folder to read
    Output: a list of parsed lex file structures (failed parses are
            reported and skipped)
    """
    lexes = []
    print("Reading lex files from", directory)
    for filename in os.listdir(directory):
        path = directory + filename
        # Read only .lex files
        if path.endswith(".lex"):
            print("  Parsing", path, end=" ")
            with open(path) as lex_file:
                lex_raw = lex_file.read()
            parsed_lex = parse_lex(lex_raw)
            if "error" in parsed_lex:
                print("ERROR:", parsed_lex["error"])
            else:
                print("SUCCESS:", parsed_lex["title"])
                lexes.append(parsed_lex)
    return lexes

def load_resource(filename, cache={}):
    """
    Loads a file from the resources folder, memoizing its contents.
    The mutable default argument is intentional: it IS the cache.
    """
    if filename not in cache:
        # Use a with-block so the file handle is not leaked
        with open("resources/" + filename) as f:
            cache[filename] = f.read()
    return cache[filename]

def load_config():
    """
    Loads values from lexicon.cfg. Each value is delimited by a
    >>>NAME>>> line and a matching <<<NAME<<< line.
    Output: a dict of config names to values
    Raises SystemExit when a value is unterminated or a required value
    is missing.
    """
    config = {}
    with open("lexicon.cfg") as f:
        line = f.readline()
        while line:
            # Skim lines until a value definition begins
            conf_match = re.match(r">>>([^>]+)>>>\s+", line)
            if not conf_match:
                line = f.readline()
                continue
            # Accumulate the conf value until the value ends
            conf = conf_match.group(1)
            conf_value = ""
            line = f.readline()
            conf_match = re.match(r"<<<{0}<<<\s+".format(conf), line)
            while line and not conf_match:
                conf_value += line
                line = f.readline()
                conf_match = re.match(r"<<<{0}<<<\s+".format(conf), line)
            if not line:
                raise SystemExit("Reached EOF while reading config value {}".format(conf))
            config[conf] = conf_value.strip()
    # Check that all necessary values were configured
    for config_value in ['LEXICON_TITLE', 'SIDEBAR_CONTENT', 'SESSION_PAGE', 'INDEX_LIST']:
        if config_value not in config:
            # The config file is lexicon.cfg; the old message named a
            # nonexistent lexipython.cfg
            raise SystemExit("Error: {} not set in lexicon.cfg".format(config_value))
    return config

# Building functions for output

def make_cite_map(lex_list):
    """
    Compiles all citation information into a single map.
    Input:  lex_list, a list of lex structures
    Output: a map from article titles to sorted lists of cited titles
    """
    cite_map = {}
    for lex in lex_list:
        cited_titles = [cite_tuple[1] for cite_tuple in lex["citations"].values()]
        # Sorting on titlestrip reproduces the old cmp=cmp_title order
        cite_map[lex["title"]] = sorted(set(cited_titles), key=titlestrip)
    return cite_map

def format_content(lex, format_func):
    """
    Formats citations into the lex content according to the provided function.
    Input:  lex, a lex structure
            format_func, a function matching (fid, alias, dest) -> citation HTML
    Output: lex content with each {c#} placeholder formatted
    """
    format_map = {
        format_id: format_func(format_id, cite_tuple[0], cite_tuple[1])
        for format_id, cite_tuple in lex["citations"].items()
    }
    return lex["content"].format(**format_map)

def citation_lists(title, cite_map):
    """
    Returns the citation lists for an article.
    Input:  title, an article title
            cite_map, generated by make_cite_map
    Output: (list of titles this article cites,
             list of titles of articles citing this article)
    """
    citers = [citer_title
              for citer_title, cited_titles in cite_map.items()
              if title in cited_titles]
    return cite_map[title], citers
def build_article_page(lex, cite_map, config):
    """
    Builds the full HTML of an article page.
    Input:  lex, a lex structure
            cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the full HTML as a string
    """
    lf = link_formatter(cite_map.keys())
    # Build the article content
    content = format_content(lex, lf)
    # Build the article citeblock
    cites, citedby = citation_lists(lex["title"], cite_map)
    cites_str = " | ".join([lf(None, cited, cited) for cited in cites])
    citedby_str = " | ".join([lf(None, citer, citer) for citer in citedby])
    # NOTE(review): the citeblock markup was reconstructed — verify
    # against the original entry templates.
    citeblock = ""\
        "<div class=\"content citeblock\">\n"\
        "<p>Citations: {cites}</p>\n"\
        "<p>Cited by: {citedby}</p>\n"\
        "</div>\n".format(
            cites=cites_str,
            citedby=citedby_str)
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title=lex["title"],
        lexicon=config["LEXICON_TITLE"],
        css=css,
        logo=config["LOGO_FILENAME"],
        sidebar=config["SIDEBAR_HTML"],
        content=content,
        citeblock=citeblock)
def build_phantom_page(title, cite_map, config):
    """
    Builds the full HTML of a phantom page: an entry that has been cited
    but not yet written.
    Input:  title, the phantom title
            cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the full HTML as a string
    """
    lf = link_formatter(cite_map.keys())
    # Fill in the content with filler
    content = "<p><i>This entry hasn't been written yet.</i></p>"
    # Build the stub citeblock; a phantom page cites nothing, so only
    # the cited-by list is shown
    _, citedby = citation_lists(title, cite_map)
    # Use a distinct comprehension variable: the old code's loop variable
    # shadowed (and in Python 2 clobbered) the title parameter
    citedby_str = " | ".join([lf(None, citer, citer) for citer in citedby])
    citeblock = ""\
        "<div class=\"content citeblock\">\n"\
        "<p>Cited by: {citedby}</p>\n"\
        "</div>\n".format(citedby=citedby_str)
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        # BUG FIX: was lex["title"], but no lex structure exists for a
        # phantom page; the page title is the phantom title itself
        title=title,
        lexicon=config["LEXICON_TITLE"],
        css=css,
        logo=config["LOGO_FILENAME"],
        sidebar=config["SIDEBAR_HTML"],
        content=content,
        citeblock=citeblock)
def build_stub_page(title, cite_map, config):
    """
    Builds the full HTML of a stub page: an Ersatz Scrivener filler
    entry for an article that was never written.
    Input:  title, the stub title
            cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the full HTML as a string
    """
    lf = link_formatter(cite_map.keys())
    # Fill in the content with filler
    content = "<p>[The handwriting is completely illegible.]</p>\n"\
        "<p class=\"signature\">Ersatz Scrivener</p>\n"
    # Build the stub citeblock; the stub's own citations are "illegible"
    _, citedby = citation_lists(title, cite_map)
    # Use a distinct comprehension variable: the old code's loop variable
    # shadowed (and in Python 2 clobbered) the title parameter
    citedby_str = " | ".join([lf(None, citer, citer) for citer in citedby])
    # BUG FIX: the old format call passed cites=cites_str, but cites_str
    # was never defined here (NameError); the citations line is a literal
    citeblock = ""\
        "<div class=\"content citeblock\">\n"\
        "<p>Citations: [Illegible]</p>\n"\
        "<p>Cited by: {citedby}</p>\n"\
        "</div>\n".format(citedby=citedby_str)
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        # BUG FIX: was lex["title"], but no lex structure exists for a stub
        title=title,
        lexicon=config["LEXICON_TITLE"],
        css=css,
        logo=config["LOGO_FILENAME"],
        sidebar=config["SIDEBAR_HTML"],
        content=content,
        citeblock=citeblock)
def build_index_page(cite_map, config):
    """
    Builds the full HTML of the index page.
    Input:  cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the HTML of the index page
    """
    # Collect every title, written and phantom, in titular order
    titles = sorted(
        set(cite_map.keys()) |
        set([title for cited_titles in cite_map.values() for title in cited_titles]),
        key=titlestrip)
    if len(titles) == len(cite_map.keys()):
        content = "<p>There are {0} entries in this lexicon.</p>\n".format(len(titles))
    else:
        content = "<p>There are {0} entries, {1} written and {2} phantom.</p>\n".format(
            len(titles), len(cite_map.keys()), len(titles) - len(cite_map.keys()))
    # Write all of the entries out as links under their indices
    # NOTE(review): the heading/bullet markup was reconstructed — verify
    # against the original entry templates.
    lf = link_formatter(cite_map.keys())
    indices = config["INDEX_LIST"].split("\n")
    for index_str in indices:
        content += "<h3>{0}</h3>\n".format(index_str)
        # Gather the titles filed under this index's letters
        index_titles = []
        for c in index_str.upper():
            for title in titles:
                if titlestrip(title)[0] == c:
                    index_titles.append(title)
        for title in index_titles:
            titles.remove(title)
            # Emit one link per indexed title (the old code only linked
            # the final title of each index)
            content += "&bull; " + lf(None, title, title) + "<br>\n"
    # Titles that fell under no listed index go in a catch-all section
    if len(titles) > 0:
        content += "<h3>&c.</h3>\n"
        for title in titles:
            content += "&bull; " + lf(None, title, title) + "<br>\n"
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title="Index of " + config["LEXICON_TITLE"],
        lexicon=config["LEXICON_TITLE"],
        css=css,
        logo=config["LOGO_FILENAME"],
        sidebar=config["SIDEBAR_HTML"],
        content=content,
        citeblock="")

def build_rules_page(config):
    """
    Builds the full HTML of the rules page.
    Input:  config, a dict of config values
    Output: the HTML of the rules page
    """
    content = load_resource("rules.html")
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title="Rules",
        lexicon=config["LEXICON_TITLE"],
        css=css,
        logo=config["LOGO_FILENAME"],
        sidebar=config["SIDEBAR_HTML"],
        content=content,
        citeblock="")

def build_formatting_page(config):
    """
    Builds the full HTML of the formatting page.
    Input:  config, a dict of config values
    Output: the HTML of the formatting page
    """
    content = load_resource("formatting.html")
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title="Formatting",
        lexicon=config["LEXICON_TITLE"],
        css=css,
        logo=config["LOGO_FILENAME"],
        sidebar=config["SIDEBAR_HTML"],
        content=content,
        citeblock="")

def build_session_page(config):
    """
    Builds the full HTML of the session page.
    Input:  config, a dict of config values
    Output: the HTML of the session page
    """
    # The session page is written in lex format in the config file
    session_lex = parse_lex(config["SESSION_PAGE"])
    # Session-page citations are not live links, just underlined text
    # NOTE(review): <u> markup reconstructed — verify against original
    content = format_content(session_lex, lambda fid, alias, dest: "<u>" + alias + "</u>")
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title=session_lex["title"],
        lexicon=config["LEXICON_TITLE"],
        css=css,
        logo=config["LOGO_FILENAME"],
        sidebar=config["SIDEBAR_HTML"],
        content=content,
        citeblock="")

    Top 10 by page rank:
    \n" + G = networkx.Graph() + for citer, citeds in cite_map.items(): + for cited in citeds: + G.add_edge(citer, cited) + ranks = networkx.pagerank(G) + sranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True) + ranking = list(enumerate(map(lambda x: x[0], sranks))) + content += "
    \n".join(map(lambda x: "{0} - {1}".format(x[0]+1, x[1]), ranking[:10])) + content += "

    \n" + # Count the top number of citations made from + content += "

    Most citations made from:
    \n" + citation_tally = [(kv[0], len(kv[1])) for kv in cite_map.items()] + citation_count = defaultdict(list) + for title, count in citation_tally: citation_count[count].append(title) + content += "
    \n".join(map( + lambda kv: "{0} - {1}".format(kv[0], "; ".join(kv[1])), + sorted(citation_count.items(), reverse=True)[:3])) + content += "

    \n" + # Count the top number of citations made to + content += "

    Most citations made to:
    \n" + all_cited = set([title for cites in cite_map.values() for title in cites]) + cited_by_map = { cited: [citer for citer in cite_map.keys() if cited in cite_map[citer]] for cited in all_cited } + cited_tally = [(kv[0], len(kv[1])) for kv in cited_by_map.items()] + cited_count = defaultdict(list) + for title, count in cited_tally: cited_count[count].append(title) + content += "
    \n".join(map( + lambda kv: "{0} - {1}".format(kv[0], "; ".join(kv[1])), + sorted(cited_count.items(), reverse=True)[:3])) + #cited_count = map(lambda kv: (kv[0], len(kv[1])), cited_by_map.items()) + #cited_count_sort = sorted(cited_count, key=lambda x: x[1], reverse=True) + #top_cited_count = [kv for kv in cited_count_sort if kv[1] >= cited_count_sort[:5][-1][1]] + #content += "
    \n".join(map(lambda x: "{0} - {1}".format(x[1], x[0]), top_cited_count)) + content += "

    \n" + # Fill in the entry skeleton + entry_skeleton = load_resource("entry-page.html") + css = load_resource("lexicon.css") + return entry_skeleton.format( + title="Statistics", + lexicon=config["LEXICON_TITLE"], + css=css, + logo=config["LOGO_FILENAME"], + sidebar=config["SIDEBAR_HTML"], + content=content, + citeblock="") + +# Summative functions + +def command_build(argv): + if len(argv) >= 3 and (argv[2] != "partial" and argv[2] != "full"): + print "unknown build type: " + argv[2] + return + # Set up the entries + config = load_config() + sidebar_parsed = parse_lex(config["SIDEBAR_CONTENT"]) + config["SIDEBAR_HTML"] = format_content(sidebar_parsed, lambda fid,alias,dest: alias) + lexes = parse_lex_from_directory("raw/") + cite_map = make_cite_map(lexes) + written_entries = cite_map.keys() + phantom_entries = set([title for cites in cite_map.values() for title in cites if title not in written_entries]) + # Clear the folder + print "Clearing old HTML files" + for filename in os.listdir("out/"): + if filename[-5:] == ".html": + print filename + os.remove("out/" + filename) + # Write the written entries + print "Writing written articles..." + for lex in lexes: + page = build_article_page(lex, cite_map, config) + with open("out/" + lex["filename"] + ".html", "w") as f: + f.write(page) + print " Wrote " + lex["title"] + # Write the unwritten entries + if len(phantom_entries) > 0: + if len(argv) < 3 or argv[2] == "partial": + print "Writing phantom articles..." + for title in phantom_entries: + page = build_phantom_page(title, cite_map, config) + with open("out/" + as_filename(title) + ".html", "w") as f: + f.write(page) + print " Wrote " + title + elif argv[2] == "full": + print "Writing stub articles..." 
+ for title in phantom_entries: + page = build_stub_page(title, cite_map, config) + with open("out/" + as_filename(title) + ".html", "w") as f: + f.write(page) + print " Wrote " + title + else: + print "ERROR: build type was " + argv[2] + return + # Write the default pages + print "Writing default pages" + page = build_rules_page(config) + with open("out/rules.html", "w") as f: + f.write(page) + print " Wrote Rules" + page = build_formatting_page(config) + with open("out/formatting.html", "w") as f: + f.write(page) + print " Wrote Formatting" + page = build_index_page(cite_map, config) + with open("out/index.html", "w") as f: + f.write(page) + print " Wrote Index" + page = build_session_page(config) + with open("out/session.html", "w") as f: + f.write(page) + print " Wrote Session" + page = build_statistics_page(cite_map, config) + with open("out/stats.html", "w") as f: + f.write(page) + print " Wrote Statistics" + +def main(): + if len(sys.argv) < 2: + print "Available commands:" + print " - build [partial] : Build the lexicon and generate phantom stubs for all unwritten articles." + print " - build full : Build the lexicon and generate Ersatz pages for all unwritten articles." + elif sys.argv[1] == "build": + command_build(sys.argv) + else: + print "Unknown command: " + sys.argv[1] + +if __name__ == "__main__": + main()