Statistics refactor and config hookup
This commit is contained in:
parent
8773f6b58f
commit
b3875fc7da
|
@ -1,12 +1,12 @@
|
|||
import sys # For argv and stderr
|
||||
# Standard library imports
|
||||
import os # For reading directories
|
||||
import re # For parsing lex content
|
||||
import io # For writing pages out as UTF-8
|
||||
import networkx # For pagerank analytics
|
||||
from collections import defaultdict # For rank inversion in statistics
|
||||
|
||||
# Application imports
|
||||
import utils
|
||||
from article import LexiconArticle
|
||||
from statistics import LexiconStatistics
|
||||
|
||||
|
||||
class LexiconPage:
|
||||
"""
|
||||
|
@ -181,191 +181,24 @@ def build_session_page(page, session_content):
|
|||
content = "<div class=\"contentblock\">{}</div>".format(session_content)
|
||||
return page.format(title="Session", content=content)
|
||||
|
||||
def reverse_statistics_dict(stats, reverse=True):
    """
    Inverts a {title: value} mapping into a list of (value, [titles]) pairs.

    The pairs are ordered by value (descending unless reverse=False), and
    each list of titles is ordered alphabetically by title sort key.
    """
    # Group titles that share the same value.
    grouped = {}
    for title, value in stats.items():
        grouped.setdefault(value, []).append(title)
    # Alphabetize each group of titles.
    inverted = [
        (value, sorted(titles, key=utils.titlesort))
        for value, titles in grouped.items()
    ]
    # Order the groups by value.
    inverted.sort(key=lambda pair: pair[0], reverse=reverse)
    return inverted
|
||||
def build_statistics_page(config, page, articles):
    """
    Builds the full HTML of the statistics page.

    The statistics to display, and their order, are read from the
    STATISTICS section of the config. Each nonblank line has the form
    "<name> on|off"; each stat toggled "on" is looked up as a
    stat_<name> method on LexiconStatistics and rendered in order.
    """
    # Read the config file for which stats to publish.
    lines = config['STATISTICS'].split("\n")
    stats = []
    for line in lines:
        if not line.strip():
            # Tolerate blank or whitespace-only lines in the config
            # section instead of crashing on the unpacking below.
            continue
        stat, toggle = line.split()
        if toggle == "on":
            stats.append("stat_" + stat)

    # Create all the stats blocks.
    lexicon_stats = LexiconStatistics(articles)
    stats_blocks = []
    for stat in stats:
        if hasattr(lexicon_stats, stat):
            stats_blocks.append(getattr(lexicon_stats, stat)())
        else:
            print("ERROR: Bad stat {}".format(stat))
    content = "\n".join(stats_blocks)

    # Fill in the entry skeleton
    return page.format(title="Statistics", content=content)
|
||||
|
@ -620,7 +453,7 @@ def build_all(path_prefix, lexicon_name):
|
|||
f.write(build_session_page(page, config["SESSION_PAGE"]))
|
||||
print(" Wrote Session")
|
||||
with open(pathto("statistics", "index.html"), "w", encoding="utf-8") as f:
|
||||
f.write(build_statistics_page(page, articles))
|
||||
f.write(build_statistics_page(config, page, articles))
|
||||
print(" Wrote Statistics")
|
||||
|
||||
# Write auxiliary pages
|
||||
|
|
|
@ -48,6 +48,21 @@ char:WXYZ
|
|||
etc:&c.
|
||||
<<<INDEX_LIST<<<
|
||||
|
||||
# Toggles and order for which statistics to display.
|
||||
# Pagerank-based statistics require networkx to be installed.
|
||||
>>>STATISTICS>>>
|
||||
top_pagerank on
|
||||
most_citations_made on
|
||||
most_citations_to on
|
||||
longest_article on
|
||||
cumulative_wordcount off
|
||||
player_pagerank on
|
||||
player_citations_made on
|
||||
player_citations_to on
|
||||
bottom_pagerank off
|
||||
undercited off
|
||||
<<<STATISTICS<<<
|
||||
|
||||
# The default sorting to use on the contents page.
|
||||
# Allowed values are "index", "turn", and "player"
|
||||
>>>DEFAULT_SORT>>>
|
||||
|
|
|
@ -0,0 +1,317 @@
|
|||
# Third party imports
|
||||
try:
|
||||
import networkx # For pagerank analytics
|
||||
NETWORKX_ENABLED = True
|
||||
except:
|
||||
NETWORKX_ENABLED = False
|
||||
|
||||
# Application imports
|
||||
from utils import titlesort
|
||||
|
||||
|
||||
def reverse_statistics_dict(stats, reverse=True):
    """
    Inverts a {title: value} mapping into a list of (value, [titles]) pairs.

    The pairs are ordered by value (descending unless reverse=False), and
    each list of titles is ordered alphabetically by title sort key.
    """
    # Group titles that share the same value.
    grouped = {}
    for title, value in stats.items():
        grouped.setdefault(value, []).append(title)
    # Alphabetize each group of titles.
    inverted = [
        (value, sorted(titles, key=titlesort))
        for value, titles in grouped.items()
    ]
    # Order the groups by value.
    inverted.sort(key=lambda pair: pair[0], reverse=reverse)
    return inverted
|
||||
|
||||
|
||||
def itemize(stats_list):
    """
    Formats a list consisting of tuples of ranks and lists of ranked items.

    Yields one "rank – item; item; ..." string per tuple.
    """
    return ("{0} – {1}".format(rank, "; ".join(ranked))
            for rank, ranked in stats_list)
|
||||
|
||||
|
||||
class LexiconStatistics():
    """
    A wrapper for a persistent statistics context with some precomputed
    values around for convenience.

    The existence of addendum articles complicates how some statistics are
    computed. An addendum is an article, with its own author, body, and
    citations, but in a Lexicon it exists appended to another article. To handle
    this, we distinguish an _article_ from a _page_. An article is a unit parsed
    from a single source file. A page is a main article and all addendums under
    the same title.
    """

    def __init__(self, articles):
        # All articles, in the order provided.
        self.articles = articles
        # Turn range, computed over authored articles only.
        self.min_turn = 0
        self.max_turn = 0
        # Every player who authored at least one article.
        self.players = set()
        # Disambiguated "Title (Tn)" -> article.
        self.title_to_article = {}
        # Page title -> [main article, addendum, ...].
        self.title_to_page = {}
        # HTML skeleton shared by every stat block.
        self.stat_block = "<div class=\"contentblock\"><u>{0}</u><br>{1}</div>\n"
        # Pagerank may not be computable if networkx isn't installed.
        self.title_to_pagerank = None

        for main_article in articles:
            page_title = main_article.title
            self.title_to_page[page_title] = [main_article]
            self.title_to_page[page_title].extend(main_article.addendums)
            for article in self.title_to_page[page_title]:
                # Disambiguate articles by appending turn number to the title
                key = "{0.title} (T{0.turn})".format(article)
                self.title_to_article[key] = article
                if article.player is not None:
                    # Phantoms have turn MAXINT by convention
                    self.min_turn = min(self.min_turn, article.turn)
                    self.max_turn = max(self.max_turn, article.turn)
                    self.players.add(article.player)

    def _word_count(self, article):
        """
        Counts the words in an article as written, with each citation alias
        replaced by its display text.
        """
        format_map = {
            "c"+str(c.id): c.text
            for c in article.citations
        }
        plain_content = article.content.format(**format_map)
        return len(plain_content.split())

    def _try_populate_pagerank(self):
        """Computes pagerank if networkx is imported."""
        if NETWORKX_ENABLED and self.title_to_pagerank is None:
            # Create a citation graph linking page titles.
            G = networkx.Graph()
            for page_title, articles in self.title_to_page.items():
                for article in articles:
                    for citation in article.citations:
                        G.add_edge(page_title, citation.target)

            # Compute pagerank on the page citation graph.
            self.title_to_pagerank = networkx.pagerank(G)
            # Any article with no links in the citation graph has no pagerank.
            # Assign these pagerank 0 to avoid key errors or missing pages in
            # the stats.
            for page_title in self.title_to_page:
                if page_title not in self.title_to_pagerank:
                    self.title_to_pagerank[page_title] = 0

    def stat_top_pagerank(self):
        """Computes the top 10 pages by pagerank."""
        self._try_populate_pagerank()

        if not self.title_to_pagerank:
            # If networkx was not successfully imported, skip the pagerank.
            # Wrap the message in a list so the join below emits it whole
            # instead of interleaving "<br>" between its characters.
            top_ranked_items = ["networkx must be installed to compute pageranks."]
        else:
            # Get the top ten articles by pagerank.
            top_pageranks = reverse_statistics_dict(self.title_to_pagerank)[:10]
            # Replace the pageranks with ordinals.
            top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1)
            # Format the ranks into strings.
            top_ranked_items = itemize(top_ranked)

        # Format the statistics block.
        return self.stat_block.format(
            "Top 10 articles by page rank:",
            "<br>".join(top_ranked_items))

    def stat_most_citations_made(self):
        """Computes the top 3 ranks for citations made FROM a page."""
        # Determine which pages are cited from all articles on a page.
        pages_cited = {
            page_title: set()
            for page_title in self.title_to_page.keys()}
        for page_title, articles in self.title_to_page.items():
            for article in articles:
                for citation in article.citations:
                    pages_cited[page_title].add(citation.target)
        # Compute the number of unique articles cited by a page.
        for page_title, cite_titles in pages_cited.items():
            pages_cited[page_title] = len(cite_titles)

        # Reverse and itemize the citation counts.
        top_citations = reverse_statistics_dict(pages_cited)[:3]
        top_citations_items = itemize(top_citations)

        # Format the statistics block.
        return self.stat_block.format(
            "Cited the most pages:",
            "<br>".join(top_citations_items))

    def stat_most_citations_to(self):
        """Computes the top 3 ranks for citations made TO a page."""
        # Determine which pages cite a page.
        pages_cited_by = {
            page_title: set()
            for page_title in self.title_to_page.keys()}
        for page_title, articles in self.title_to_page.items():
            for article in articles:
                for citation in article.citations:
                    # NOTE(review): assumes every citation target exists as a
                    # page title (phantoms included) -- an unknown target would
                    # raise KeyError here. Confirm against the parser.
                    pages_cited_by[citation.target].add(page_title)
        # Compute the number of unique articles that cite a page.
        for page_title, cite_titles in pages_cited_by.items():
            pages_cited_by[page_title] = len(cite_titles)

        # Reverse and itemize the citation counts.
        top_cited = reverse_statistics_dict(pages_cited_by)[:3]
        top_cited_items = itemize(top_cited)

        # Format the statistics block.
        return self.stat_block.format(
            "Cited by the most pages:",
            "<br>".join(top_cited_items))

    def stat_longest_article(self):
        """Computes the top 3 longest articles."""
        # Compute the length of each article (not page).
        title_to_article_length = {
            article_title: self._word_count(article)
            for article_title, article in self.title_to_article.items()}

        # Reverse and itemize the article lengths.
        top_length = reverse_statistics_dict(title_to_article_length)[:3]
        top_length_items = itemize(top_length)

        # Format the statistics block.
        return self.stat_block.format(
            "Longest articles:",
            "<br>".join(top_length_items))

    def stat_cumulative_wordcount(self):
        """Computes the cumulative word count of the lexicon."""
        # Initialize all extant turns to 0.
        turn_to_cumulative_wordcount = {
            turn_num: 0
            for turn_num in range(self.min_turn, self.max_turn + 1)
        }
        for article_title, article in self.title_to_article.items():
            word_count = self._word_count(article)
            # Add the word count to each turn the article exists in.
            for turn_num in range(self.min_turn, self.max_turn + 1):
                if article.turn <= turn_num:
                    turn_to_cumulative_wordcount[turn_num] += word_count

        # Format the statistics block.
        len_list = [(str(k), [str(v)]) for k, v in turn_to_cumulative_wordcount.items()]
        return self.stat_block.format(
            "Aggregate word count by turn:",
            "<br>".join(itemize(len_list)))

    def stat_player_pagerank(self):
        """Computes each player's share of the lexicon's pagerank scores."""
        self._try_populate_pagerank()

        if not self.title_to_pagerank:
            # If networkx was not successfully imported, skip the pagerank.
            # Wrap the message in a list so the join below emits it whole.
            player_rank_items = ["networkx must be installed to compute pageranks."]
        else:
            player_to_pagerank = {
                player: 0
                for player in self.players}
            # Accumulate page pagerank to the main article's author.
            for page_title, articles in self.title_to_page.items():
                page_author = articles[0].player
                if page_author is not None:
                    player_to_pagerank[page_author] += self.title_to_pagerank[page_title]
            # Round pageranks off to 3 decimal places.
            for player, pagerank in player_to_pagerank.items():
                player_to_pagerank[player] = round(pagerank, 3)

            # Reverse and itemize the aggregated pageranks.
            player_rank = reverse_statistics_dict(player_to_pagerank)
            player_rank_items = itemize(player_rank)

        # Format the statistics block.
        return self.stat_block.format(
            "Player aggregate page rank:",
            "<br>".join(player_rank_items))

    def stat_player_citations_made(self):
        """Computes the total number of citations made BY each player."""
        pages_cited_by_player = {
            player: 0
            for player in self.players}
        # Add the number of citations from each authored article (not page).
        for article_title, article in self.title_to_article.items():
            if article.player is not None:
                pages_cited_by_player[article.player] += len(article.citations)

        # Reverse and itemize the counts.
        player_cites_made_ranks = reverse_statistics_dict(pages_cited_by_player)
        player_cites_made_items = itemize(player_cites_made_ranks)

        # Format the statistics block.
        return self.stat_block.format(
            "Citations made by player:",
            "<br>".join(player_cites_made_items))

    def stat_player_citations_to(self):
        """Computes the total number of citations made TO each player's
        authored pages."""
        pages_cited_by_by_player = {
            player: 0
            for player in self.players}
        # Add the number of citations made to each page (not article).
        for page_title, articles in self.title_to_page.items():
            page_author = articles[0].player
            if page_author is not None:
                pages_cited_by_by_player[page_author] += len(articles[0].citedby)

        # Reverse and itemize the results.
        cited_times_ranked = reverse_statistics_dict(pages_cited_by_by_player)
        cited_times_items = itemize(cited_times_ranked)

        # Format the statistics block.
        return self.stat_block.format(
            "Citations made to article by player:",
            "<br>".join(cited_times_items))

    def stat_bottom_pagerank(self):
        """Computes the bottom 10 pages by pagerank."""
        self._try_populate_pagerank()

        if not self.title_to_pagerank:
            # If networkx was not successfully imported, skip the pagerank.
            # Wrap the message in a list so the join below emits it whole.
            bot_ranked_items = ["networkx must be installed to compute pageranks."]
        else:
            # Phantoms have no pagerank, because they don't cite anything.
            exclude = [
                a.title
                for a in self.articles
                if a.player is None]
            rank_by_written_only = {
                k: v
                for k, v in self.title_to_pagerank.items()
                if k not in exclude}

            # Reverse, enumerate, and itemize the bottom 10 by pagerank.
            pageranks = reverse_statistics_dict(rank_by_written_only)
            bot_ranked = list(enumerate(map(lambda x: x[1], pageranks), start=1))[-10:]
            bot_ranked_items = itemize(bot_ranked)

        # Format the statistics block.
        return self.stat_block.format(
            "Bottom 10 articles by page rank:",
            "<br>".join(bot_ranked_items))

    def stat_undercited(self):
        """Computes which articles have 0 or 1 citations made to them."""
        undercited = {
            page_title: len(articles[0].citedby)
            for page_title, articles in self.title_to_page.items()
            if len(articles[0].citedby) < 2}
        undercited_items = itemize(reverse_statistics_dict(undercited))
        return self.stat_block.format(
            "Undercited articles:",
            "<br>".join(undercited_items))
|
Loading…
Reference in New Issue