Add main code file

2017-08-27 00:14:16 -05:00 · 2017-08-27 00:14:16 -05:00 · d77870534a
commit d77870534a
parent ef354f6950
1 changed files with 575 additions and 0 deletions
--- a/lexipython.py
+++ b/lexipython.py
@ -0,0 +1,575 @@
+###############################
+## Lexipython Lexicon engine ##
+###############################
+
+import sys		# For argv and stderr
+import os		# For reading directories
+import re		# For parsing lex content
+import networkx # For pagerank analytics
+from collections import defaultdict # For rank inversion in statistics
+
+# Utility functions for handling titles and filenames
+
+def titlecase(s):
+	s = s.strip()
+	return s[:1].capitalize() + s[1:]
+
+def as_filename(s):
+	"""Makes a string filename-safe."""
+	# Strip out <, >, :, ", ', /, \, |, ?, and *
+	s = re.sub(r"[<>:\"'/\\|?*]", '', s)
+	# Strip out Unicode for -
+	s = re.sub(r"[^\x00-\x7F]+", '-', s)
+	# Strip out whitespace for _
+	s = re.sub(r"\s+", '_', s)
+	return s
+
+def titlestrip(s):
+	"""Strips certain prefixes for title sorting."""
+	if s.startswith("The "): return s[4:]
+	if s.startswith("An "): return s[3:]
+	if s.startswith("A "): return s[2:]
+	return s
+
+def cmp_title(x, y):
+	"""Compares strings in titular order, ignoring prefixed articles."""
+	return cmp(titlestrip(x), titlestrip(y))
+
+def link_formatter(written_articles):
+	"""
+	Creates a lambda that formats citation links and handles the phantom class.
+	Input:  written_articles, a list of article titles to format as live links
+	Output: a lambda (fid, alias, title) -> link_string
+	"""
+	return lambda fid, alias, title: "<a href=\"{1}.html\"{2}>{0}</a>".format(
+			alias, as_filename(title),
+			"" if title in written_articles else " class=\"phantom\""
+		)
+
+# Parsing functions for source intake
+
+def parse_lex_header(header_para):
+	"""
+	Parses the header paragraph of a lex file.
+	Input:  header_para, raw header paragraph from the lex file
+	Output: {"error": <error message>} if there was an error, otherwise
+	        {"title": <article title>, "filename": <article filename>}
+	"""
+	# The title, which is also translated to the filename, heads the article after the #
+	title_match = re.match("#(.+)", header_para)
+	if not title_match:
+		return {"error": "No match for title"}
+	title = titlecase(title_match.group(1).strip())
+	if not title:
+		return {"error": "Could not parse header as title"}
+	return {"title": title, "filename": as_filename(title)}
+
+def parse_lex_content(paras):
+	"""
+	Parses the content paragraphs of a lex file.
+	Input:  paras, a list of raw paragraphs from the lex file
+	Output: {"error": <error message>} if there was an error, otherwise
+	        {"content": <article HTML content>,
+			 "citations": {<format id>: (link text, link target)}}
+	"""
+	parsed = {"content": "", "citations": {}}
+	format_id = 1 # Each citation will be ID'd by {c#} for formatting later
+	for para in paras:
+		# Escape angle brackets
+		para = re.sub("<", "&lt;", para)
+		para = re.sub(">", "&gt;", para)
+		# Replace bold and italic marks with tags
+		para = re.sub(r"\*\*([^*]+)\*\*", r"<b>\1</b>", para)
+		para = re.sub(r"\/\/([^\/]+)\/\/", r"<i>\1</i>", para)
+		# Replace \\LF with <br>LF
+		para = re.sub(r"\\\\\n", "<br>\n", para)
+		# Abstract citations into the citation record
+		link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
+		while link_match:
+			# Identify the citation text and cited article
+			cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
+			cite_title = titlecase(link_match.group(3).strip())
+			# Record the citation
+			parsed["citations"]["c"+str(format_id)] = (cite_text, cite_title)
+			# Stitch the format id in place of the citation
+			para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):]
+			format_id += 1 # Increment to the next format citation
+			link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
+		# Convert signature to right-aligned
+		if para[:1] == '~':
+			para = "<hr><span class=\"signature\"><p>" + para[1:] + "</p></span>\n"
+		else:
+			para = "<p>" + para + "</p>\n"
+		parsed["content"] += para
+	if not parsed["content"]:
+		return {"error": "No content parsed"}
+	return parsed
+
+def parse_lex(lex_contents):
+	"""
+	Parses the contents of a lex file into HTML and abstracts citations.
+	Input:  lex_contents, the read contents of a lex file
+	Output: A dictionary in the following format:
+	{"title": <article title>, "filename": <article filename>,
+	 "content": <article HTML content>, 
+	 "citations": {<format id>: (link text, link target)}}
+	"""
+	parsed_article = {}
+	# Split the file into paragraphs
+	paras = re.split("\n\n+", lex_contents)
+	# Parse the title from the header
+	title_parsed = parse_lex_header(paras.pop(0))
+	if "error" in title_parsed:
+		return title_parsed
+	parsed_article.update(title_parsed)
+	# Parse each paragraph
+	content_parsed = parse_lex_content(paras)
+	if "error" in content_parsed:
+		return content_parsed
+	parsed_article.update(content_parsed)
+	# Return the fully abstracted article
+	return parsed_article
+
+def parse_lex_from_directory(directory):
+	"""
+	Reads and parses each lex file in the given directory.
+	Input:  directory, the path to the folder to read
+	Output: a list of parsed lex file structures
+	"""
+	lexes = []
+	print "Reading lex files from", directory
+	for filename in os.listdir(directory):
+		path = directory + filename
+		# Read only .lex files
+		if path[-4:] == ".lex":
+			print "    Parsing", path,
+			with open(path) as lex_file:
+				lex_raw = lex_file.read()
+				parsed_lex = parse_lex(lex_raw)
+				if "error" in parsed_lex:
+					print "ERROR:", parsed_lex["error"]
+				else:
+					print "SUCCESS:", parsed_lex["title"]
+					lexes.append(parsed_lex)
+	return lexes
+
+def load_resource(filename, cache={}):
+	if filename not in cache:
+		cache[filename] = open("resources/" + filename).read()
+	return cache[filename]
+
+def load_config():
+	config = {}
+	with open("lexicon.cfg") as f:
+		line = f.readline()
+		while line:
+			# Skim lines until a value definition begins
+			conf_match = re.match(">>>([^>]+)>>>\s+", line)
+			if not conf_match:
+				line = f.readline()
+				continue
+			# Accumulate the conf value until the value ends
+			conf = conf_match.group(1)
+			conf_value = ""
+			line = f.readline()
+			conf_match = re.match("<<<{0}<<<\s+".format(conf), line)
+			while line and not conf_match:
+				conf_value += line
+				line = f.readline()
+				conf_match = re.match("<<<{0}<<<\s+".format(conf), line)
+			if not line:
+				raise SystemExit("Reached EOF while reading config value {}".format(conf))
+			config[conf] = conf_value.strip()
+	# Check that all necessary values were configured
+	for config_value in ['LEXICON_TITLE', 'SIDEBAR_CONTENT', 'SESSION_PAGE', "INDEX_LIST"]:
+		if config_value not in config:
+			raise SystemExit("Error: {} not set in lexipython.cfg".format(config_value))
+	return config
+
+# Building functions for output
+
+def make_cite_map(lex_list):
+	"""
+	Compiles all citation information into a single map.
+	Input:  lex_list, a list of lex structures
+	Output: a map from article titles to cited titles
+	"""
+	cite_map = {}
+	for lex in lex_list:
+		cited_titles = [cite_tuple[1] for format_id, cite_tuple in lex["citations"].items()]
+		cite_map[lex["title"]] = sorted(set(cited_titles), cmp=cmp_title)
+	return cite_map
+
+def format_content(lex, format_func):
+	"""
+	Formats citations into the lex content according to the provided function.
+	Input:  lex, a lex structure
+	        formatted_key, the key to store the formatted content under
+	        format_func, a function matching (fid, alias, dest) -> citation HTML
+	Output: lex content formatted according to format_func
+	"""
+	format_map = {
+		format_id: format_func(format_id, cite_tuple[0], cite_tuple[1])
+		for format_id, cite_tuple in lex["citations"].items()
+	}
+	return lex["content"].format(**format_map)
+
+def citation_lists(title, cite_map):
+	"""
+	Returns the citation lists for an article.
+	Input:  title, an article title
+			cite_map, generated by make_cite_map
+	Output: a list of cited article titles
+	        a list of titles of article citing this article
+	"""
+	citers = [citer_title
+		for citer_title, cited_titles in cite_map.items()
+		if title in cited_titles]
+	return cite_map[title], citers
+
+def build_article_page(lex, cite_map, config):
+	"""
+	Builds the full HTML of an article page.
+	Input:  lex, a lex structure
+			cite_map, generated by make_cite_map
+			config, a dict of config values
+	Output: the full HTML as a string
+	"""
+	lf = link_formatter(cite_map.keys())
+	# Build the article content
+	content = format_content(lex, lf)
+	# Build the article citeblock
+	cites, citedby = citation_lists(lex["title"], cite_map)
+	cites_str = " | ".join([lf(None, title, title) for title in cites])
+	citedby_str = " | ".join([lf(None, title, title) for title in citedby])
+	citeblock = ""\
+		"<div class=\"content citeblock\">\n"\
+		"<p>Citations: {cites}</p>\n"\
+		"<p>Cited by: {citedby}</p>\n"\
+		"</div>\n".format(
+			cites=cites_str,
+			citedby=citedby_str)
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title=lex["title"],
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock=citeblock)
+
+def build_phantom_page(title, cite_map, config):
+	"""
+	Builds the full HTML of a phantom page.
+	Input:  title, the phantom title
+			cite_map, generated by make_cite_map
+			config, a dict of config values
+	Output: the full HTML as a string
+	"""
+	lf = link_formatter(cite_map.keys())
+	# Fill in the content with filler
+	content = "<p><i>This entry hasn't been written yet.</i></p>"
+	# Build the stub citeblock
+	cites, citedby = citation_lists(title, cite_map)
+	citedby_str = " | ".join([lf(None, title, title) for title in citedby])
+	citeblock = ""\
+		"<div class=\"content citeblock\">\n"\
+		"<p>Cited by: {citedby}</p>\n"\
+		"</div>\n".format(
+			cites=cites_str,
+			citedby=citedby_str)
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title=lex["title"],
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock=citeblock)
+
+def build_stub_page(title, cite_map, config):
+	"""
+	Builds the full HTML of a stub page.
+	Input:  title, the stub title
+			cite_map, generated by make_cite_map
+			config, a dict of config values
+	Output: the full HTML as a string
+	"""
+	lf = link_formatter(cite_map.keys())
+	# Fill in the content with filler
+	content = "<p>[The handwriting is completely illegible.]</p>\n"\
+		"<hr><span class=\"signature\"><p>Ersatz Scrivener</p></span>\n"
+	# Build the stub citeblock
+	cites, citedby = citation_lists(title, cite_map)
+	citedby_str = " | ".join([lf(None, title, title) for title in citedby])
+	citeblock = ""\
+		"<div class=\"content citeblock\">\n"\
+		"<p>Citations: [Illegible]</p>\n"\
+		"<p>Cited by: {citedby}</p>\n"\
+		"</div>\n".format(
+			cites=cites_str,
+			citedby=citedby_str)
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title=lex["title"],
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock=citeblock)
+
+def build_index_page(cite_map, config):
+	"""
+	Builds the full HTML of the index page.
+	Input:	cite_map, generated by make_cite_map
+			config, a dict of config values
+	Output:	the HTML of the index page
+	"""
+	# Count up all the titles
+	titles = sorted(
+		(set(cite_map.keys()) |
+		set([title for cited_titles in cite_map.values() for title in cited_titles]))
+		, cmp=cmp_title)
+	content = ""
+	if len(titles) == len(cite_map.keys()):
+		content = "<p>There are <b>{0}</b> entries in this lexicon.</p>\n<ul>\n".format(len(titles))
+	else:
+		content = "<p>There are <b>{0}</b> entries, <b>{1}</b> written and <b>{2}</b> phantom.</p>\n<ul>\n".format(
+			len(titles), len(cite_map.keys()), len(titles) - len(cite_map.keys()))
+	# Write all of the entries out as links under their indices
+	lf = link_formatter(cite_map.keys())
+	indices = config["INDEX_LIST"].split("\n")
+	for index_str in indices:
+		content += "<h3>{0}</h3>".format(index_str)
+		index_titles = []
+		for c in index_str.upper():
+			for title in titles:
+				if (titlestrip(title)[0] == c):
+					index_titles.append(title)
+		for title in index_titles:
+			titles.remove(title)
+			content += "<li>"
+			content += lf(None, title, title)
+			content += "</li>\n"
+	if len(titles) > 0:
+		content += "<h3>&c.</h3>".format(index_str)
+		for title in titles:
+			content += "<li>"
+			content += lf(None, title, title)
+			content += "</li>\n"
+	content += "</ul>\n"
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title="Index of " + config["LEXICON_TITLE"],
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock="")
+
+def build_rules_page(config):
+	"""
+	Builds the full HTML of the rules page.
+	Input:	config, a dict of config values
+	Output:	the HTML of the rules page
+	"""
+	content = load_resource("rules.html")
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title="Rules",
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock="")
+
+def build_formatting_page(config):
+	"""
+	Builds the full HTML of the formatting page.
+	Input:	config, a dict of config values
+	Output:	the HTML of the formatting page
+	"""
+	content = load_resource("formatting.html")
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title="Formatting",
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock="")
+
+def build_session_page(config):
+	"""
+	Builds the full HTML of the session page.
+	Input:	config, a dict of config values
+	Output:	the HTML of the session page
+	"""
+	session_lex = parse_lex(config["SESSION_PAGE"])
+	content = format_content(session_lex, lambda fid,alias,dest: "<u>" + alias + "</u>")
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title=session_lex["title"],
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock="")
+
+def build_statistics_page(cite_map, config):
+	"""
+	Builds the full HTML of the statistics page.
+	Input:	citation_map, a dictionary returned by make_cite_map
+			config, a dict of config values
+	Output:	the HTML of the statistics page
+	"""
+	content = ""
+	# Compute the pagerank
+	content += "<p><u>Top 10 by page rank:</u><br>\n"
+	G = networkx.Graph()
+	for citer, citeds in cite_map.items():
+		for cited in citeds:
+			G.add_edge(citer, cited)
+	ranks = networkx.pagerank(G)
+	sranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
+	ranking = list(enumerate(map(lambda x: x[0], sranks)))
+	content += "<br>\n".join(map(lambda x: "{0} - {1}".format(x[0]+1, x[1]), ranking[:10]))
+	content += "</p>\n"
+	# Count the top number of citations made from
+	content += "<p><u>Most citations made from:</u><br>\n"
+	citation_tally = [(kv[0], len(kv[1])) for kv in cite_map.items()]
+	citation_count = defaultdict(list)
+	for title, count in citation_tally: citation_count[count].append(title)
+	content += "<br>\n".join(map(
+			lambda kv: "{0} - {1}".format(kv[0], "; ".join(kv[1])),
+			sorted(citation_count.items(), reverse=True)[:3]))
+	content += "</p>\n"
+	# Count the top number of citations made to
+	content += "<p><u>Most citations made to:</u><br>\n"
+	all_cited = set([title for cites in cite_map.values() for title in cites])
+	cited_by_map = { cited: [citer for citer in cite_map.keys() if cited in cite_map[citer]] for cited in all_cited }
+	cited_tally = [(kv[0], len(kv[1])) for kv in cited_by_map.items()]
+	cited_count = defaultdict(list)
+	for title, count in cited_tally: cited_count[count].append(title)
+	content += "<br>\n".join(map(
+			lambda kv: "{0} - {1}".format(kv[0], "; ".join(kv[1])),
+			sorted(cited_count.items(), reverse=True)[:3]))
+	#cited_count = map(lambda kv: (kv[0], len(kv[1])), cited_by_map.items())
+	#cited_count_sort = sorted(cited_count, key=lambda x: x[1], reverse=True)
+	#top_cited_count = [kv for kv in cited_count_sort if kv[1] >= cited_count_sort[:5][-1][1]]
+	#content += "<br>\n".join(map(lambda x: "{0} - {1}".format(x[1], x[0]), top_cited_count))
+	content += "</p>\n"
+	# Fill in the entry skeleton
+	entry_skeleton = load_resource("entry-page.html")
+	css = load_resource("lexicon.css")
+	return entry_skeleton.format(
+		title="Statistics",
+		lexicon=config["LEXICON_TITLE"],
+		css=css,
+		logo=config["LOGO_FILENAME"],
+		sidebar=config["SIDEBAR_HTML"],
+		content=content,
+		citeblock="")
+
+# Summative functions
+
+def command_build(argv):
+	if len(argv) >= 3 and (argv[2] != "partial" and argv[2] != "full"):
+		print "unknown build type: " + argv[2]
+		return
+	# Set up the entries
+	config = load_config()
+	sidebar_parsed = parse_lex(config["SIDEBAR_CONTENT"])
+	config["SIDEBAR_HTML"] = format_content(sidebar_parsed, lambda fid,alias,dest: alias)
+	lexes = parse_lex_from_directory("raw/")
+	cite_map = make_cite_map(lexes)
+	written_entries = cite_map.keys()
+	phantom_entries = set([title for cites in cite_map.values() for title in cites if title not in written_entries])
+	# Clear the folder
+	print "Clearing old HTML files"
+	for filename in os.listdir("out/"):
+		if filename[-5:] == ".html":
+			print filename
+			os.remove("out/" + filename)
+	# Write the written entries
+	print "Writing written articles..."
+	for lex in lexes:
+		page = build_article_page(lex, cite_map, config)
+		with open("out/" + lex["filename"] + ".html", "w") as f:
+			f.write(page)
+		print "    Wrote " + lex["title"]
+	# Write the unwritten entries
+	if len(phantom_entries) > 0:
+		if len(argv) < 3 or argv[2] == "partial":
+			print "Writing phantom articles..."
+			for title in phantom_entries:
+				page = build_phantom_page(title, cite_map, config)
+				with open("out/" + as_filename(title) + ".html", "w") as f:
+					f.write(page)
+				print "    Wrote " + title
+		elif argv[2] == "full":
+			print "Writing stub articles..."
+			for title in phantom_entries:
+				page = build_stub_page(title, cite_map, config)
+				with open("out/" + as_filename(title) + ".html", "w") as f:
+					f.write(page)
+				print "    Wrote " + title
+		else:
+			print "ERROR: build type was " + argv[2]
+			return
+	# Write the default pages
+	print "Writing default pages"
+	page = build_rules_page(config)
+	with open("out/rules.html", "w") as f:
+		f.write(page)
+	print "    Wrote Rules"
+	page = build_formatting_page(config)
+	with open("out/formatting.html", "w") as f:
+		f.write(page)
+	print "    Wrote Formatting"
+	page = build_index_page(cite_map, config)
+	with open("out/index.html", "w") as f:
+		f.write(page)
+	print "    Wrote Index"
+	page = build_session_page(config)
+	with open("out/session.html", "w") as f:
+		f.write(page)
+	print "    Wrote Session"
+	page = build_statistics_page(cite_map, config)
+	with open("out/stats.html", "w") as f:
+		f.write(page)
+	print "    Wrote Statistics"
+
+def main():
+	if len(sys.argv) < 2:
+		print "Available commands:"
+		print " - build [partial] : Build the lexicon and generate phantom stubs for all unwritten articles."
+		print " - build full      : Build the lexicon and generate Ersatz pages for all unwritten articles."
+	elif sys.argv[1] == "build":
+		command_build(sys.argv)
+	else:
+		print "Unknown command: " + sys.argv[1]
+
+if __name__ == "__main__":
+	main()