###############################
## Lexipython Lexicon engine ##
###############################

import sys                              # For argv and stderr
import os                               # For reading directories
import re                               # For parsing lex content
import networkx                         # For pagerank analytics
from collections import defaultdict    # For rank inversion in statistics


# Utility functions for handling titles and filenames

def titlecase(s):
    """Capitalizes the first letter of a stripped string, leaving the rest as-is."""
    s = s.strip()
    return s[:1].capitalize() + s[1:]

def as_filename(s):
    """Makes a string filename-safe."""
    # Strip out <, >, :, ", ', /, \, |, ?, and *
    s = re.sub(r"[<>:\"'/\\|?*]", '', s)
    # Strip out Unicode for -
    s = re.sub(r"[^\x00-\x7F]+", '-', s)
    # Strip out whitespace for _
    s = re.sub(r"\s+", '_', s)
    return s

def titlestrip(s):
    """Strips certain prefixes ("The ", "An ", "A ") for title sorting."""
    if s.startswith("The "):
        return s[4:]
    if s.startswith("An "):
        return s[3:]
    if s.startswith("A "):
        return s[2:]
    return s

def cmp_title(x, y):
    """Compares strings in titular order, ignoring prefixed articles.
    NOTE: relies on the Python 2 builtin cmp(); this module is Python 2 code.
    """
    return cmp(titlestrip(x), titlestrip(y))

def link_formatter(written_articles):
    """
    Creates a lambda that formats citation links and handles the phantom class.
    Input:  written_articles, a list of article titles to format as live links
    Output: a lambda (fid, alias, title) -> link_string
    """
    # NOTE(review): the anchor markup was reconstructed from the three format
    # arguments (alias, target filename, phantom class attribute); confirm
    # against the project's published HTML output.
    return lambda fid, alias, title: "<a href=\"{1}.html\"{2}>{0}</a>".format(
        alias, as_filename(title),
        "" if title in written_articles else " class=\"phantom\"")

# Parsing functions for source intake

def parse_lex_header(header_para):
    """
    Parses the header paragraph of a lex file.
    Input:  header_para, raw header paragraph from the lex file
    Output: {"error": <msg>} if there was an error, otherwise
            {"title": <title>, "filename": <filename>}
    """
    # The title, which is also translated to the filename, heads the article
    # after the # marker
    title_match = re.match(r"#(.+)", header_para)
    if not title_match:
        return {"error": "No match for title"}
    title = titlecase(title_match.group(1).strip())
    if not title:
        return {"error": "Could not parse header as title"}
    return {"title": title, "filename": as_filename(title)}
def parse_lex_content(paras):
    """
    Parses the content paragraphs of a lex file.
    Input:  paras, a list of raw paragraphs from the lex file
    Output: {"error": <msg>} if there was an error, otherwise
            {"content": <HTML content>,
             "citations": {<format id>: (link text, link target)}}
    """
    parsed = {"content": "", "citations": {}}
    format_id = 1  # Each citation will be ID'd by {c#} for formatting later
    # Citation syntax: [[alias|Target]] or [[Target]]; hoisted so the two
    # search sites below cannot drift apart.
    cite_regex = r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]"
    for para in paras:
        # Escape angle brackets so raw input cannot inject HTML
        # NOTE(review): the entity/tag literals in this function were
        # reconstructed (extraction destroyed the originals) -- confirm
        # against the published source.
        para = re.sub("<", "&lt;", para)
        para = re.sub(">", "&gt;", para)
        # Replace bold and italic marks with tags
        para = re.sub(r"\*\*([^*]+)\*\*", r"<b>\1</b>", para)
        para = re.sub(r"\/\/([^\/]+)\/\/", r"<i>\1</i>", para)
        # Replace \\LF with <br>LF
        para = re.sub(r"\\\\\n", "<br>\n", para)
        # Abstract citations into the citation record
        link_match = re.search(cite_regex, para)
        while link_match:
            # Identify the citation text and cited article
            cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
            cite_title = titlecase(link_match.group(3).strip())
            # Record the citation
            parsed["citations"]["c" + str(format_id)] = (cite_text, cite_title)
            # Stitch the format id in place of the citation
            para = para[:link_match.start(0)] + "{c" + str(format_id) + "}" \
                + para[link_match.end(0):]
            format_id += 1  # Increment to the next format citation
            link_match = re.search(cite_regex, para)
        # Convert signature paragraphs (leading ~) to the signature style
        if para[:1] == '~':
            para = "<p class=\"signature\">" + para[1:] + "</p>\n"
        else:
            para = "<p>" + para + "</p>\n"
        parsed["content"] += para
    if not parsed["content"]:
        return {"error": "No content parsed"}
    return parsed
def parse_lex(lex_contents):
    """
    Parses the contents of a lex file into HTML and abstracts citations.
    Input:  lex_contents, the read contents of a lex file
    Output: A dictionary in the following format:
            {"title": <title>, "filename": <filename>,
             "content": <HTML content>,
             "citations": {<format id>: (link text, link target)}}
    """
    parsed_article = {}
    # Split the file into paragraphs
    paras = re.split("\n\n+", lex_contents)
    # Parse the title from the header
    title_parsed = parse_lex_header(paras.pop(0))
    if "error" in title_parsed:
        return title_parsed
    parsed_article.update(title_parsed)
    # Parse each paragraph
    content_parsed = parse_lex_content(paras)
    if "error" in content_parsed:
        return content_parsed
    parsed_article.update(content_parsed)
    # Return the fully abstracted article
    return parsed_article

def parse_lex_from_directory(directory):
    """
    Reads and parses each lex file in the given directory.
    Input:  directory, the path to the folder to read
    Output: a list of parsed lex file structures (errored files are
            reported but not included)
    """
    lexes = []
    # print() with a single argument is valid in both Python 2 and 3
    print("Reading lex files from " + directory)
    # Sorted for a deterministic processing order (os.listdir order is
    # platform-dependent)
    for filename in sorted(os.listdir(directory)):
        path = directory + filename
        # Read only .lex files
        if path.endswith(".lex"):
            sys.stdout.write("  Parsing " + path)
            with open(path) as lex_file:
                lex_raw = lex_file.read()
            parsed_lex = parse_lex(lex_raw)
            if "error" in parsed_lex:
                print(" ERROR: " + parsed_lex["error"])
            else:
                print(" SUCCESS: " + parsed_lex["title"])
                lexes.append(parsed_lex)
    return lexes

def load_resource(filename, cache={}):
    """Loads, caches, and returns the contents of resources/<filename>.
    The mutable default argument is deliberate: it is a shared memoization
    cache that persists across calls.
    """
    if filename not in cache:
        # BUG FIX: the original left the file handle open; a context manager
        # guarantees it is closed.
        with open("resources/" + filename) as f:
            cache[filename] = f.read()
    return cache[filename]

def load_config():
    """
    Reads config values out of lexicon.cfg.
    Values are delimited by >>>NAME>>> ... <<<NAME<<< marker lines.
    Output: a dict mapping config names to their (stripped) values
    Raises SystemExit if a value is unterminated or a required value
    is missing.
    """
    config = {}
    with open("lexicon.cfg") as f:
        line = f.readline()
        while line:
            # Skim lines until a value definition begins
            conf_match = re.match(r">>>([^>]+)>>>\s+", line)
            if not conf_match:
                line = f.readline()
                continue
            # Accumulate the conf value until the value ends
            conf = conf_match.group(1)
            conf_value = ""
            line = f.readline()
            conf_match = re.match(r"<<<{0}<<<\s+".format(conf), line)
            while line and not conf_match:
                conf_value += line
                line = f.readline()
                conf_match = re.match(r"<<<{0}<<<\s+".format(conf), line)
            if not line:
                raise SystemExit("Reached EOF while reading config value {}".format(conf))
            config[conf] = conf_value.strip()
    # Check that all necessary values were configured.
    # LOGO_FILENAME added to the required list: every page builder reads it,
    # so fail fast here rather than with a KeyError mid-build.
    for config_value in ['LEXICON_TITLE', 'SIDEBAR_CONTENT', 'SESSION_PAGE',
                         'INDEX_LIST', 'LOGO_FILENAME']:
        if config_value not in config:
            # BUG FIX: the file read above is lexicon.cfg, not lexipython.cfg
            raise SystemExit("Error: {} not set in lexicon.cfg".format(config_value))
    return config

# Building functions for output

def make_cite_map(lex_list):
    """
    Compiles all citation information into a single map.
    Input:  lex_list, a list of lex structures
    Output: a map from article titles to sorted lists of cited titles
    """
    cite_map = {}
    for lex in lex_list:
        cited_titles = [cite_tuple[1]
                        for format_id, cite_tuple in lex["citations"].items()]
        # cmp= keyword: Python 2 sorted()
        cite_map[lex["title"]] = sorted(set(cited_titles), cmp=cmp_title)
    return cite_map

def format_content(lex, format_func):
    """
    Formats citations into the lex content according to the provided function.
    Input:  lex, a lex structure
            format_func, a function matching (fid, alias, dest) -> citation HTML
    Output: lex content formatted according to format_func
    """
    format_map = {
        format_id: format_func(format_id, cite_tuple[0], cite_tuple[1])
        for format_id, cite_tuple in lex["citations"].items()
    }
    return lex["content"].format(**format_map)

def citation_lists(title, cite_map):
    """
    Returns the citation lists for an article.
    Input:  title, an article title
            cite_map, generated by make_cite_map
    Output: a list of cited article titles, and
            a list of titles of articles citing this article
    """
    citers = [citer_title
              for citer_title, cited_titles in cite_map.items()
              if title in cited_titles]
    # .get() guards against titles (e.g. phantoms) that are not cite_map keys
    return cite_map.get(title, []), citers
def build_article_page(lex, cite_map, config):
    """
    Builds the full HTML of an article page.
    Input:  lex, a lex structure
            cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the full HTML as a string
    """
    lf = link_formatter(cite_map.keys())
    # Build the article content
    content = format_content(lex, lf)
    # Build the article citeblock
    cites, citedby = citation_lists(lex["title"], cite_map)
    cites_str = " | ".join([lf(None, title, title) for title in cites])
    citedby_str = " | ".join([lf(None, title, title) for title in citedby])
    # NOTE(review): the citeblock markup was reconstructed (extraction
    # destroyed the original tag literals) -- confirm against published HTML.
    citeblock = ""\
        "<div class=\"content citeblock\">\n"\
        "<p>Citations: {cites}</p>\n"\
        "<p>Cited by: {citedby}</p>\n"\
        "</div>\n".format(cites=cites_str, citedby=citedby_str)
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title=lex["title"], lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock=citeblock)
def build_phantom_page(title, cite_map, config):
    """
    Builds the full HTML of a phantom page (a cited but unwritten article).
    Input:  title, the phantom title
            cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the full HTML as a string
    """
    lf = link_formatter(cite_map.keys())
    # Fill in the content with filler
    # NOTE(review): markup reconstructed -- confirm against published source
    content = "<p><i>This entry hasn't been written yet.</i></p>"
    # Build the stub citeblock.
    # BUG FIX: a phantom title is never a key of cite_map, so the original
    # citation_lists(title, cite_map) call would raise KeyError on
    # cite_map[title]; compute the citing articles directly instead.
    citedby = [citer_title
               for citer_title, cited_titles in cite_map.items()
               if title in cited_titles]
    citedby_str = " | ".join([lf(None, t, t) for t in citedby])
    citeblock = ""\
        "<div class=\"content citeblock\">\n"\
        "<p>Cited by: {citedby}</p>\n"\
        "</div>\n".format(citedby=citedby_str)
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title=title, lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock=citeblock)
def build_stub_page(title, cite_map, config):
    """
    Builds the full HTML of a stub ("Ersatz Scrivener") page for an
    unwritten article in a full build.
    Input:  title, the stub title
            cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the full HTML as a string
    """
    lf = link_formatter(cite_map.keys())
    # Fill in the content with filler
    # NOTE(review): markup reconstructed -- confirm against published source
    content = "<p><i>[The handwriting is completely illegible.]</i></p>\n"\
        "<p class=\"signature\">Ersatz Scrivener</p>\n"
    # Build the stub citeblock; stub titles are not keys of cite_map, so the
    # citing articles are computed directly
    citedby = [citer_title
               for citer_title, cited_titles in cite_map.items()
               if title in cited_titles]
    citedby_str = " | ".join([lf(None, t, t) for t in citedby])
    citeblock = ""\
        "<div class=\"content citeblock\">\n"\
        "<p>Citations: [Illegible]</p>\n"\
        "<p>Cited by: {citedby}</p>\n"\
        "</div>\n".format(citedby=citedby_str)
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title=title, lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock=citeblock)
def build_index_page(cite_map, config):
    """
    Builds the full HTML of the index page.
    Input:  cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the HTML of the index page
    """
    # Count up all the titles: written articles plus every cited title
    titles = sorted(
        (set(cite_map.keys())
         | set([title for cited_titles in cite_map.values()
                for title in cited_titles])),
        cmp=cmp_title)
    # NOTE(review): list markup in this function was reconstructed
    # (extraction destroyed the original tag literals) -- confirm against
    # published HTML.
    if len(titles) == len(cite_map.keys()):
        content = "<p>There are {0} entries in this lexicon.</p>\n"\
            "<ul>\n".format(len(titles))
    else:
        content = "<p>There are {0} entries, {1} written and {2} phantom.</p>\n"\
            "<ul>\n".format(
                len(titles), len(cite_map.keys()),
                len(titles) - len(cite_map.keys()))
    # Write all of the entries out as links under their indices
    lf = link_formatter(cite_map.keys())
    indices = config["INDEX_LIST"].split("\n")
    for index_str in indices:
        content += "<h3>{0}</h3>\n".format(index_str)
        # Gather the titles whose (article-stripped) first letter is in
        # this index group
        index_titles = []
        for c in index_str.upper():
            for title in titles:
                if titlestrip(title)[0] == c:
                    index_titles.append(title)
        for title in index_titles:
            titles.remove(title)
            content += "<li>"
            content += lf(None, title, title)
            content += "</li>\n"
    # Anything not claimed by an index group lands under "&c."
    if len(titles) > 0:
        # (A no-op .format(index_str) call on this literal was removed.)
        content += "<h3>&c.</h3>\n"
        for title in titles:
            content += "<li>"
            content += lf(None, title, title)
            content += "</li>\n"
    content += "</ul>\n"
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title="Index of " + config["LEXICON_TITLE"],
        lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock="")

def build_rules_page(config):
    """
    Builds the full HTML of the rules page.
    Input:  config, a dict of config values
    Output: the HTML of the rules page
    """
    content = load_resource("rules.html")
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title="Rules", lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock="")

def build_formatting_page(config):
    """
    Builds the full HTML of the formatting page.
    Input:  config, a dict of config values
    Output: the HTML of the formatting page
    """
    content = load_resource("formatting.html")
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title="Formatting", lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock="")
def build_session_page(config):
    """
    Builds the full HTML of the session page.
    Input:  config, a dict of config values
    Output: the HTML of the session page
    """
    session_lex = parse_lex(config["SESSION_PAGE"])
    # Session-page citations are rendered as plain styled text, not links.
    # NOTE(review): the wrapper tags were destroyed by extraction; <u> is a
    # reconstruction -- confirm against the published source.
    content = format_content(
        session_lex, lambda fid, alias, dest: "<u>" + alias + "</u>")
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title=session_lex["title"], lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock="")

def build_statistics_page(cite_map, config):
    """
    Builds the full HTML of the statistics page.
    Input:  cite_map, generated by make_cite_map
            config, a dict of config values
    Output: the HTML of the statistics page
    """
    # NOTE(review): the <div>/<p>/<br> literals below were reconstructed --
    # confirm against published HTML.
    content = ""
    # Compute the pagerank
    content += "<div class=\"content summary\">\n<p><u>Top 10 by page rank:</u><br>\n"
    # NOTE(review): citations are directional, but an undirected Graph is
    # built here; confirm whether a DiGraph was intended before changing.
    G = networkx.Graph()
    for citer, citeds in cite_map.items():
        for cited in citeds:
            G.add_edge(citer, cited)
    ranks = networkx.pagerank(G)
    sranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
    ranking = list(enumerate(map(lambda x: x[0], sranks)))
    content += "<br>\n".join(
        map(lambda x: "{0} - {1}".format(x[0] + 1, x[1]), ranking[:10]))
    content += "</p>\n</div>\n"
    # Count the top number of citations made from each article; ties are
    # grouped by inverting the tally into count -> [titles]
    content += "<div class=\"content summary\">\n<p><u>Most citations made from:</u><br>\n"
    citation_tally = [(kv[0], len(kv[1])) for kv in cite_map.items()]
    citation_count = defaultdict(list)
    for title, count in citation_tally:
        citation_count[count].append(title)
    content += "<br>\n".join(map(
        lambda kv: "{0} - {1}".format(kv[0], "; ".join(kv[1])),
        sorted(citation_count.items(), reverse=True)[:3]))
    content += "</p>\n</div>\n"
    # Count the top number of citations made to each article
    content += "<div class=\"content summary\">\n<p><u>Most citations made to:</u><br>\n"
    all_cited = set([title for cites in cite_map.values() for title in cites])
    cited_by_map = {
        cited: [citer for citer in cite_map.keys() if cited in cite_map[citer]]
        for cited in all_cited}
    cited_tally = [(kv[0], len(kv[1])) for kv in cited_by_map.items()]
    cited_count = defaultdict(list)
    for title, count in cited_tally:
        cited_count[count].append(title)
    content += "<br>\n".join(map(
        lambda kv: "{0} - {1}".format(kv[0], "; ".join(kv[1])),
        sorted(cited_count.items(), reverse=True)[:3]))
    # (A block of commented-out superseded tally code was removed here.)
    content += "</p>\n</div>\n"
    # Fill in the entry skeleton
    entry_skeleton = load_resource("entry-page.html")
    css = load_resource("lexicon.css")
    return entry_skeleton.format(
        title="Statistics", lexicon=config["LEXICON_TITLE"], css=css,
        logo=config["LOGO_FILENAME"], sidebar=config["SIDEBAR_HTML"],
        content=content, citeblock="")

# Summative functions
def command_build(argv):
    """
    Builds the lexicon site into out/ from the lex files in raw/.
    Input:  argv, the process argument list; argv[2] (optional) is the
            build type, "partial" (default, phantom pages for unwritten
            articles) or "full" (Ersatz stub pages instead)
    Output: None; writes one HTML file per article plus the default pages
    """
    if len(argv) >= 3 and (argv[2] != "partial" and argv[2] != "full"):
        # print() with a single argument is valid in both Python 2 and 3
        print("unknown build type: " + argv[2])
        return
    # Set up the entries
    config = load_config()
    sidebar_parsed = parse_lex(config["SIDEBAR_CONTENT"])
    # Sidebar citations render as bare aliases, not links
    config["SIDEBAR_HTML"] = format_content(
        sidebar_parsed, lambda fid, alias, dest: alias)
    lexes = parse_lex_from_directory("raw/")
    cite_map = make_cite_map(lexes)
    written_entries = cite_map.keys()
    phantom_entries = set([title for cites in cite_map.values()
                           for title in cites
                           if title not in written_entries])
    # Clear the folder of previous output
    print("Clearing old HTML files")
    for filename in os.listdir("out/"):
        if filename.endswith(".html"):
            os.remove("out/" + filename)
    # Write the written entries
    print("Writing written articles...")
    for lex in lexes:
        page = build_article_page(lex, cite_map, config)
        with open("out/" + lex["filename"] + ".html", "w") as f:
            f.write(page)
        print("  Wrote " + lex["title"])
    # Write the unwritten entries
    if len(phantom_entries) > 0:
        if len(argv) < 3 or argv[2] == "partial":
            print("Writing phantom articles...")
            for title in phantom_entries:
                page = build_phantom_page(title, cite_map, config)
                with open("out/" + as_filename(title) + ".html", "w") as f:
                    f.write(page)
                print("  Wrote " + title)
        elif argv[2] == "full":
            print("Writing stub articles...")
            for title in phantom_entries:
                page = build_stub_page(title, cite_map, config)
                with open("out/" + as_filename(title) + ".html", "w") as f:
                    f.write(page)
                print("  Wrote " + title)
        else:
            # Unreachable: the build type was validated at the top of this
            # function; kept as a defensive guard.
            print("ERROR: build type was " + argv[2])
            return
    # Write the default pages
    print("Writing default pages")
    page = build_rules_page(config)
    with open("out/rules.html", "w") as f:
        f.write(page)
    print("  Wrote Rules")
    page = build_formatting_page(config)
    with open("out/formatting.html", "w") as f:
        f.write(page)
    print("  Wrote Formatting")
    page = build_index_page(cite_map, config)
    with open("out/index.html", "w") as f:
        f.write(page)
    print("  Wrote Index")
    page = build_session_page(config)
    with open("out/session.html", "w") as f:
        f.write(page)
    print("  Wrote Session")
    page = build_statistics_page(cite_map, config)
    with open("out/stats.html", "w") as f:
        f.write(page)
    print("  Wrote Statistics")

def main():
    """Dispatches on the first command-line argument; prints usage otherwise."""
    if len(sys.argv) < 2:
        print("Available commands:")
        print(" - build [partial] : Build the lexicon and generate phantom stubs for all unwritten articles.")
        print(" - build full : Build the lexicon and generate Ersatz pages for all unwritten articles.")
    elif sys.argv[1] == "build":
        command_build(sys.argv)
    else:
        print("Unknown command: " + sys.argv[1])

if __name__ == "__main__":
    main()