Statistics refactor and config hookup
This commit is contained in:
parent
8773f6b58f
commit
b3875fc7da
|
@ -1,12 +1,12 @@
|
||||||
import sys # For argv and stderr
|
# Standard library imports
|
||||||
import os # For reading directories
|
import os # For reading directories
|
||||||
import re # For parsing lex content
|
import re # For parsing lex content
|
||||||
import io # For writing pages out as UTF-8
|
|
||||||
import networkx # For pagerank analytics
|
|
||||||
from collections import defaultdict # For rank inversion in statistics
|
|
||||||
|
|
||||||
|
# Application imports
|
||||||
import utils
|
import utils
|
||||||
from article import LexiconArticle
|
from article import LexiconArticle
|
||||||
|
from statistics import LexiconStatistics
|
||||||
|
|
||||||
|
|
||||||
class LexiconPage:
|
class LexiconPage:
|
||||||
"""
|
"""
|
||||||
|
@ -181,191 +181,24 @@ def build_session_page(page, session_content):
|
||||||
content = "<div class=\"contentblock\">{}</div>".format(session_content)
|
content = "<div class=\"contentblock\">{}</div>".format(session_content)
|
||||||
return page.format(title="Session", content=content)
|
return page.format(title="Session", content=content)
|
||||||
|
|
||||||
def reverse_statistics_dict(stats, reverse=True):
|
def build_statistics_page(config, page, articles):
    """
    Builds the full HTML of the statistics page.

    Which statistics blocks are rendered, and in what order, is driven by
    the STATISTICS section of the config: one "<name> on|off" pair per
    line. Each enabled name is dispatched to the matching stat_<name>
    method on LexiconStatistics.
    """
    # Read the config file for which stats to publish.
    lines = config['STATISTICS'].split("\n")
    stats = []
    for line in lines:
        # Tolerate blank lines and comment lines in the config section;
        # a bare line.split() would raise ValueError on them.
        if not line.strip() or line.lstrip().startswith("#"):
            continue
        stat, toggle = line.split()
        if toggle == "on":
            stats.append("stat_" + stat)

    # Create all the stats blocks.
    lexicon_stats = LexiconStatistics(articles)
    stats_blocks = []
    for stat in stats:
        if hasattr(lexicon_stats, stat):
            stats_blocks.append(getattr(lexicon_stats, stat)())
        else:
            # Misconfigured stat names are reported to stderr so they don't
            # pollute the build's normal stdout progress output.
            print("ERROR: Bad stat {}".format(stat), file=sys.stderr)
    content = "\n".join(stats_blocks)

    # Fill in the entry skeleton
    return page.format(title="Statistics", content=content)
|
||||||
|
@ -620,7 +453,7 @@ def build_all(path_prefix, lexicon_name):
|
||||||
f.write(build_session_page(page, config["SESSION_PAGE"]))
|
f.write(build_session_page(page, config["SESSION_PAGE"]))
|
||||||
print(" Wrote Session")
|
print(" Wrote Session")
|
||||||
with open(pathto("statistics", "index.html"), "w", encoding="utf-8") as f:
|
with open(pathto("statistics", "index.html"), "w", encoding="utf-8") as f:
|
||||||
f.write(build_statistics_page(page, articles))
|
f.write(build_statistics_page(config, page, articles))
|
||||||
print(" Wrote Statistics")
|
print(" Wrote Statistics")
|
||||||
|
|
||||||
# Write auxiliary pages
|
# Write auxiliary pages
|
||||||
|
|
|
@ -48,6 +48,21 @@ char:WXYZ
|
||||||
etc:&c.
|
etc:&c.
|
||||||
<<<INDEX_LIST<<<
|
<<<INDEX_LIST<<<
|
||||||
|
|
||||||
|
# Toggles and order for which statistics to display.
|
||||||
|
# Pagerank-based statistics require networkx to be installed.
|
||||||
|
>>>STATISTICS>>>
|
||||||
|
top_pagerank on
|
||||||
|
most_citations_made on
|
||||||
|
most_citations_to on
|
||||||
|
longest_article on
|
||||||
|
cumulative_wordcount off
|
||||||
|
player_pagerank on
|
||||||
|
player_citations_made on
|
||||||
|
player_citations_to on
|
||||||
|
bottom_pagerank off
|
||||||
|
undercited off
|
||||||
|
<<<STATISTICS<<<
|
||||||
|
|
||||||
# The default sorting to use on the contents page.
|
# The default sorting to use on the contents page.
|
||||||
# Allowed values are "index", "turn", and "player"
|
# Allowed values are "index", "turn", and "player"
|
||||||
>>>DEFAULT_SORT>>>
|
>>>DEFAULT_SORT>>>
|
||||||
|
|
|
@ -0,0 +1,317 @@
|
||||||
|
# Third party imports
try:
    import networkx  # For pagerank analytics
    NETWORKX_ENABLED = True
except ImportError:
    # Catch only ImportError: a bare except would also swallow
    # SystemExit/KeyboardInterrupt raised during import.
    # Pagerank-based statistics are skipped when networkx is missing.
    NETWORKX_ENABLED = False

# Application imports
from utils import titlesort
|
||||||
|
|
||||||
|
|
||||||
|
def reverse_statistics_dict(stats, reverse=True):
    """
    Transforms a dictionary mapping titles to a value into a list of values
    and lists of titles. The list is sorted by the value, and the titles are
    sorted alphabetically.
    """
    # Group titles under their shared value.
    by_value = {}
    for title, value in stats.items():
        by_value.setdefault(value, []).append(title)
    # Alphabetize each group of titles (per titlesort's ordering).
    for value in by_value:
        by_value[value] = sorted(by_value[value], key=titlesort)
    # Order the (value, titles) pairs by value, descending by default.
    return sorted(by_value.items(), key=lambda pair: pair[0], reverse=reverse)
|
||||||
|
|
||||||
|
|
||||||
|
def itemize(stats_list):
    """
    Formats a list consisting of tuples of ranks and lists of ranked items.
    """
    # Render each (rank, titles) pair as "rank – title; title; ...".
    return ("{0} – {1}".format(rank, "; ".join(ranked))
            for rank, ranked in stats_list)
|
||||||
|
|
||||||
|
|
||||||
|
class LexiconStatistics():
    """
    A wrapper for a persistent statistics context with some precomputed
    values around for convenience.

    The existence of addendum articles complicates how some statistics are
    computed. An addendum is an article, with its own author, body, and
    citations, but in a Lexicon it exists appended to another article. To handle
    this, we distinguish an _article_ from a _page_. An article is a unit parsed
    from a single source file. A page is a main article and all addendums under
    the same title.
    """

    def __init__(self, articles):
        """
        Precomputes the page/article maps, the turn range, and the player
        set from the given list of main articles.
        """
        self.articles = articles
        self.min_turn = 0
        self.max_turn = 0
        self.players = set()
        # Maps "Title (Tn)" disambiguated titles to individual articles.
        self.title_to_article = {}
        # Maps a page title to [main_article, *addendums].
        self.title_to_page = {}
        # Shared HTML skeleton for each rendered statistics block.
        self.stat_block = "<div class=\"contentblock\"><u>{0}</u><br>{1}</div>\n"
        # Pagerank may not be computable if networkx isn't installed.
        self.title_to_pagerank = None

        for main_article in articles:
            page_title = main_article.title
            self.title_to_page[page_title] = [main_article]
            self.title_to_page[page_title].extend(main_article.addendums)
            for article in self.title_to_page[page_title]:
                # Disambiguate articles by appending turn number to the title
                key = "{0.title} (T{0.turn})".format(article)
                self.title_to_article[key] = article
                if article.player is not None:
                    # Phantoms have turn MAXINT by convention
                    self.min_turn = min(self.min_turn, article.turn)
                    self.max_turn = max(self.max_turn, article.turn)
                    self.players.add(article.player)

    @staticmethod
    def _article_word_count(article):
        """
        Computes an article's word count as written, with citation aliases
        substituted into the text so cited titles count as their link text.
        """
        format_map = {
            "c" + str(c.id): c.text
            for c in article.citations
        }
        plain_content = article.content.format(**format_map)
        return len(plain_content.split())

    def _try_populate_pagerank(self):
        """Computes pagerank if networkx is imported."""
        if NETWORKX_ENABLED and self.title_to_pagerank is None:
            # Create a citation graph linking page titles.
            G = networkx.Graph()
            for page_title, articles in self.title_to_page.items():
                for article in articles:
                    for citation in article.citations:
                        G.add_edge(page_title, citation.target)

            # Compute pagerank on the page citation graph.
            self.title_to_pagerank = networkx.pagerank(G)
            # Any article with no links in the citation graph has no
            # pagerank. Assign these pagerank 0 to avoid key errors or
            # missing pages in the stats.
            for page_title, articles in self.title_to_page.items():
                if page_title not in self.title_to_pagerank:
                    self.title_to_pagerank[page_title] = 0

    def stat_top_pagerank(self):
        """Computes the top 10 pages by pagerank."""
        self._try_populate_pagerank()

        if not self.title_to_pagerank:
            # If networkx was not successfully imported, skip the pagerank.
            top_ranked_items = "networkx must be installed to compute pageranks."
        else:
            # Get the top ten articles by pagerank.
            top_pageranks = reverse_statistics_dict(self.title_to_pagerank)[:10]
            # Replace the pageranks with ordinals.
            top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1)
            # Format the ranks into strings.
            top_ranked_items = itemize(top_ranked)

        # Format the statistics block.
        return self.stat_block.format(
            "Top 10 articles by page rank:",
            "<br>".join(top_ranked_items))

    def stat_most_citations_made(self):
        """Computes the top 3 ranks for citations made FROM a page."""
        # Determine which pages are cited from all articles on a page.
        pages_cited = {
            page_title: set()
            for page_title in self.title_to_page.keys()}
        for page_title, articles in self.title_to_page.items():
            for article in articles:
                for citation in article.citations:
                    pages_cited[page_title].add(citation.target)
        # Compute the number of unique articles cited by a page.
        for page_title, cite_titles in pages_cited.items():
            pages_cited[page_title] = len(cite_titles)

        # Reverse and itemize the citation counts.
        top_citations = reverse_statistics_dict(pages_cited)[:3]
        top_citations_items = itemize(top_citations)

        # Format the statistics block.
        return self.stat_block.format(
            "Cited the most pages:",
            "<br>".join(top_citations_items))

    def stat_most_citations_to(self):
        """Computes the top 3 ranks for citations made TO a page."""
        # Determine which pages cite a page.
        pages_cited_by = {
            page_title: set()
            for page_title in self.title_to_page.keys()}
        for page_title, articles in self.title_to_page.items():
            for article in articles:
                for citation in article.citations:
                    pages_cited_by[citation.target].add(page_title)
        # Compute the number of unique articles that cite a page.
        for page_title, cite_titles in pages_cited_by.items():
            pages_cited_by[page_title] = len(cite_titles)

        # Reverse and itemize the citation counts.
        top_cited = reverse_statistics_dict(pages_cited_by)[:3]
        top_cited_items = itemize(top_cited)

        # Format the statistics block.
        return self.stat_block.format(
            "Cited by the most pages:",
            "<br>".join(top_cited_items))

    def stat_longest_article(self):
        """Computes the top 3 longest articles."""
        # Compute the length of each article (not page).
        title_to_article_length = {}
        for article_title, article in self.title_to_article.items():
            title_to_article_length[article_title] = (
                self._article_word_count(article))

        # Reverse and itemize the article lengths.
        top_length = reverse_statistics_dict(title_to_article_length)[:3]
        top_length_items = itemize(top_length)

        # Format the statistics block.
        return self.stat_block.format(
            "Longest articles:",
            "<br>".join(top_length_items))

    def stat_cumulative_wordcount(self):
        """Computes the cumulative word count of the lexicon."""
        # Initialize all extant turns to 0.
        turn_to_cumulative_wordcount = {
            turn_num: 0
            for turn_num in range(self.min_turn, self.max_turn + 1)
        }
        for article_title, article in self.title_to_article.items():
            word_count = self._article_word_count(article)
            # Add the word count to each turn the article exists in.
            for turn_num in range(self.min_turn, self.max_turn + 1):
                if article.turn <= turn_num:
                    turn_to_cumulative_wordcount[turn_num] += word_count

        # Format the statistics block.
        len_list = [(str(k), [str(v)]) for k,v in turn_to_cumulative_wordcount.items()]
        return self.stat_block.format(
            "Aggregate word count by turn:",
            "<br>".join(itemize(len_list)))

    def stat_player_pagerank(self):
        """Computes each player's share of the lexicon's pagerank scores."""
        self._try_populate_pagerank()

        if not self.title_to_pagerank:
            # If networkx was not successfully imported, skip the pagerank.
            player_rank_items = "networkx must be installed to compute pageranks."
        else:
            player_to_pagerank = {
                player: 0
                for player in self.players}
            # Accumulate page pagerank to the main article's author.
            for page_title, articles in self.title_to_page.items():
                page_author = articles[0].player
                if page_author is not None:
                    player_to_pagerank[page_author] += self.title_to_pagerank[page_title]
            # Round pageranks off to 3 decimal places.
            for player, pagerank in player_to_pagerank.items():
                player_to_pagerank[player] = round(pagerank, 3)

            # Reverse and itemize the aggregated pageranks.
            player_rank = reverse_statistics_dict(player_to_pagerank)
            player_rank_items = itemize(player_rank)

        # Format the statistics block.
        return self.stat_block.format(
            "Player aggregate page rank:",
            "<br>".join(player_rank_items))

    def stat_player_citations_made(self):
        """Computes the total number of citations made BY each player."""
        pages_cited_by_player = {
            player: 0
            for player in self.players}
        # Add the number of citations from each authored article (not page).
        for article_title, article in self.title_to_article.items():
            if article.player is not None:
                pages_cited_by_player[article.player] += len(article.citations)

        # Reverse and itemize the counts.
        player_cites_made_ranks = reverse_statistics_dict(pages_cited_by_player)
        player_cites_made_items = itemize(player_cites_made_ranks)

        # Format the statistics block.
        return self.stat_block.format(
            "Citations made by player:",
            "<br>".join(player_cites_made_items))

    def stat_player_citations_to(self):
        """Computes the total number of citations made TO each player's
        authored pages."""
        pages_cited_by_by_player = {
            player: 0
            for player in self.players}
        # Add the number of citations made to each page (not article).
        for page_title, articles in self.title_to_page.items():
            page_author = articles[0].player
            if page_author is not None:
                pages_cited_by_by_player[page_author] += len(articles[0].citedby)

        # Reverse and itemize the results.
        cited_times_ranked = reverse_statistics_dict(pages_cited_by_by_player)
        cited_times_items = itemize(cited_times_ranked)

        # Format the statistics block.
        return self.stat_block.format(
            "Citations made to article by player:",
            "<br>".join(cited_times_items))

    def stat_bottom_pagerank(self):
        """Computes the bottom 10 pages by pagerank."""
        self._try_populate_pagerank()

        if not self.title_to_pagerank:
            # If networkx was not successfully imported, skip the pagerank.
            bot_ranked_items = "networkx must be installed to compute pageranks."
        else:
            # Phantoms have no pagerank, because they don't cite anything.
            exclude = [
                a.title
                for a in self.articles
                if a.player is None]
            rank_by_written_only = {
                k:v
                for k,v in self.title_to_pagerank.items()
                if k not in exclude}

            # Reverse, enumerate, and itemize the bottom 10 by pagerank.
            pageranks = reverse_statistics_dict(rank_by_written_only)
            bot_ranked = list(enumerate(map(lambda x: x[1], pageranks), start=1))[-10:]
            bot_ranked_items = itemize(bot_ranked)

        # Format the statistics block.
        return self.stat_block.format(
            "Bottom 10 articles by page rank:",
            "<br>".join(bot_ranked_items))

    def stat_undercited(self):
        """Computes which articles have 0 or 1 citations made to them."""
        undercited = {
            page_title: len(articles[0].citedby)
            for page_title, articles in self.title_to_page.items()
            if len(articles[0].citedby) < 2}
        undercited_items = itemize(reverse_statistics_dict(undercited))
        return self.stat_block.format(
            "Undercited articles:",
            "<br>".join(undercited_items))
|
Loading…
Reference in New Issue