From b3875fc7da92cb678f8f37d13336ac584d85510f Mon Sep 17 00:00:00 2001
From: Tim Van Baak
Date: Sat, 18 May 2019 00:16:46 -0700
Subject: [PATCH] Statistics refactor and config hookup

---
 lexipython/build.py              | 211 +++----------------
 lexipython/resources/lexicon.cfg |  15 ++
 lexipython/statistics.py         | 317 +++++++++++++++++++++++++++++++
 3 files changed, 354 insertions(+), 189 deletions(-)
 create mode 100644 lexipython/statistics.py

diff --git a/lexipython/build.py b/lexipython/build.py
index a8ea3f5..45ec841 100644
--- a/lexipython/build.py
+++ b/lexipython/build.py
@@ -1,12 +1,12 @@
-import sys # For argv and stderr
+# Standard library imports
 import os # For reading directories
 import re # For parsing lex content
-import io # For writing pages out as UTF-8
-import networkx # For pagerank analytics
-from collections import defaultdict # For rank inversion in statistics
+# Application imports
 import utils
 from article import LexiconArticle
+from statistics import LexiconStatistics
+
 
 class LexiconPage:
 	"""
@@ -181,191 +181,24 @@ def build_session_page(page, session_content):
 	content = "<div class=\"contentblock\">{}</div>".format(session_content)
 	return page.format(title="Session", content=content)
 
-def reverse_statistics_dict(stats, reverse=True):
-	"""
-	Transforms a dictionary mapping titles to a value into a list of values
-	and lists of titles. The list is sorted by the value, and the titles are
-	sorted alphabetically.
-	"""
-	rev = {}
-	for key, value in stats.items():
-		if value not in rev:
-			rev[value] = []
-		rev[value].append(key)
-	for key, value in rev.items():
-		rev[key] = sorted(value, key=lambda t: utils.titlesort(t))
-	return sorted(rev.items(), key=lambda x:x[0], reverse=reverse)
+def build_statistics_page(config, page, articles):
+	# Read the config file for which stats to publish.
+	lines = config['STATISTICS'].split("\n")
+	stats = []
+	for line in lines:
+		stat, toggle = line.split()
+		if toggle == "on":
+			stats.append("stat_" + stat)
 
-def itemize(stats_list):
-	return map(lambda x: "{0} – {1}".format(x[0], "; ".join(x[1])), stats_list)
-
-def build_statistics_page(page, articles):
-	"""
-	Builds the full HTML of the statistics page.
-
-	The existence of addendum articles complicates how some statistics are
-	computed. An addendum is an article, with its own author, body, and
-	citations, but in a Lexicon it exists appended to another article. To handle
-	this, we distinguish an _article_ from a _page_. An article is a unit parsed
-	from a single source file. A page is a main article and all addendums under
-	the same title.
-	"""
-	min_turn = 0
-	max_turn = 0
-	article_by_title = {}
-	page_by_title = {}
-	players = set()
-	for main_article in articles:
-		key = main_article.title
-		page_by_title[key] = [main_article]
-		page_by_title[key].extend(main_article.addendums)
-		for article in [main_article] + main_article.addendums:
-			# Disambiguate articles by appending turn number to the title
-			key = "{0.title} (T{0.turn})".format(article)
-			article_by_title[key] = article
-			if article.player is not None:
-				min_turn = min(min_turn, article.turn)
-				max_turn = max(max_turn, article.turn)
-				players.add(article.player)
-	content = ""
-	stat_block = "<div class=\"contentblock\"><u>{0}</u><br>{1}</div>\n"
-
-	# Top pages by pagerank
-	# Compute pagerank for each page, including all articles
-	G = networkx.Graph()
-	for page_title, articles in page_by_title.items():
-		for article in articles:
-			for citation in article.citations:
-				G.add_edge(page_title, citation.target)
-	pagerank_by_title = networkx.pagerank(G)
-	for page_title, articles in page_by_title.items():
-		if page_title not in pagerank_by_title:
-			pagerank_by_title[page_title] = 0
-	# Get the top ten articles by pagerank
-	top_pageranks = reverse_statistics_dict(pagerank_by_title)[:10]
-	# Replace the pageranks with ordinals
-	top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1)
-	# Format the ranks into strings
-	top_ranked_items = itemize(top_ranked)
-	# Write the statistics to the page
-	content += stat_block.format(
-		"Top 10 articles by page rank:",
-		"<br>".join(top_ranked_items))
-
-	# Pages cited/cited by
-	pages_cited = {page_title: set() for page_title in page_by_title.keys()}
-	pages_cited_by = {page_title: set() for page_title in page_by_title.keys()}
-	for page_title, articles in page_by_title.items():
-		for article in articles:
-			for citation in article.citations:
-				pages_cited[page_title].add(citation.target)
-				pages_cited_by[citation.target].add(page_title)
-	for page_title, cite_titles in pages_cited.items():
-		pages_cited[page_title] = len(cite_titles)
-	for page_title, cite_titles in pages_cited_by.items():
-		pages_cited_by[page_title] = len(cite_titles)
-
-	top_citations = reverse_statistics_dict(pages_cited)[:3]
-	top_citations_items = itemize(top_citations)
-	content += stat_block.format(
-		"Cited the most pages:",
-		"<br>".join(top_citations_items))
-	top_cited = reverse_statistics_dict(pages_cited_by)[:3]
-	top_cited_items = itemize(top_cited)
-	content += stat_block.format(
-		"Cited by the most pages:",
-		"<br>".join(top_cited_items))
-
-	# Top article length
-	article_length_by_title = {}
-	cumulative_article_length_by_turn = {
-		turn_num: 0
-		for turn_num in range(min_turn, max_turn + 1)
-	}
-	for article_title, article in article_by_title.items():
-		format_map = {
-			"c"+str(c.id): c.text
-			for c in article.citations
-		}
-		plain_content = article.content.format(**format_map)
-		word_count = len(plain_content.split())
-		article_length_by_title[article_title] = word_count
-		for turn_num in range(min_turn, max_turn + 1):
-			if article.turn <= turn_num:
-				cumulative_article_length_by_turn[turn_num] += word_count
-	top_length = reverse_statistics_dict(article_length_by_title)[:3]
-	top_length_items = itemize(top_length)
-	content += stat_block.format(
-		"Longest articles:",
-		"<br>".join(top_length_items))
-
-	# Total word count
-	len_list = [(str(k), [str(v)]) for k,v in cumulative_article_length_by_turn.items()]
-	content += stat_block.format(
-		"Aggregate word count by turn:",
-		"<br>".join(itemize(len_list)))
-
-	# Player pageranks
-	pagerank_by_player = {player: 0 for player in players}
-	for page_title, articles in page_by_title.items():
-		page_author = articles[0].player
-		if page_author is not None:
-			pagerank_by_player[page_author] += pagerank_by_title[page_title]
-	for player, pagerank in pagerank_by_player.items():
-		pagerank_by_player[player] = round(pagerank, 3)
-	player_rank = reverse_statistics_dict(pagerank_by_player)
-	player_rank_items = itemize(player_rank)
-	content += stat_block.format(
-		"Player aggregate page rank:",
-		"<br>".join(player_rank_items))
-
-	# Player citations made
-	pages_cited_by_player = {player: 0 for player in players}
-	for article_title, article in article_by_title.items():
-		if article.player is not None:
-			pages_cited_by_player[article.player] += len(article.citations)
-	player_cites_made_ranks = reverse_statistics_dict(pages_cited_by_player)
-	player_cites_made_items = itemize(player_cites_made_ranks)
-	content += "<div class=\"contentblock\">\n"
-	content += "<u>Citations made by player:</u><br>\n"
-	content += "<br>\n".join(player_cites_made_items)
-	content += "</div>\n"
-
-	# Player cited count
-	pages_cited_by_by_player = {player: 0 for player in players}
-	for page_title, articles in page_by_title.items():
-		page_author = articles[0].player
-		if page_author is not None:
-			pages_cited_by_by_player[page_author] += len(articles[0].citedby)
-	cited_times_ranked = reverse_statistics_dict(pages_cited_by_by_player)
-	cited_times_items = itemize(cited_times_ranked)
-	content += "<div class=\"contentblock\">\n"
-	content += "<u>Citations made to article by player:</u><br>\n"
-	content += "<br>\n".join(cited_times_items)
-	content += "</div>\n"
-
-	# Lowest pagerank of written articles
-	exclude = [a.title for a in articles if a.player is None]
-	rank_by_written_only = {k:v for k,v in pagerank_by_title.items() if k not in exclude}
-	pageranks = reverse_statistics_dict(rank_by_written_only)
-	bot_ranked = list(enumerate(map(lambda x: x[1], pageranks), start=1))[-10:]
-	# Format the ranks into strings
-	bot_ranked_items = itemize(bot_ranked)
-	content += "<div class=\"contentblock\">\n"
-	content += "<u>Bottom 10 articles by pagerank:</u><br>\n"
-	content += "<br>\n".join(bot_ranked_items)
-	content += "</div>\n"
-
-	# Undercited articles
-	undercited = {
-		page_title: len(articles[0].citedby)
-		for page_title, articles in page_by_title.items()
-		if len(articles[0].citedby) < 2}
-	undercited_items = itemize(reverse_statistics_dict(undercited))
-	content += "<div class=\"contentblock\">\n"
-	content += "<u>Undercited articles:</u><br>\n"
-	content += "<br>\n".join(undercited_items)
-	content += "</div>\n"
+	# Create all the stats blocks.
+	lexicon_stats = LexiconStatistics(articles)
+	stats_blocks = []
+	for stat in stats:
+		if hasattr(lexicon_stats, stat):
+			stats_blocks.append(getattr(lexicon_stats, stat)())
+		else:
+			print("ERROR: Bad stat {}".format(stat))
+	content = "\n".join(stats_blocks)
 
 	# Fill in the entry skeleton
 	return page.format(title="Statistics", content=content)
@@ -620,7 +453,7 @@ def build_all(path_prefix, lexicon_name):
 			f.write(build_session_page(page, config["SESSION_PAGE"]))
 		print("    Wrote Session")
 	with open(pathto("statistics", "index.html"), "w", encoding="utf-8") as f:
-		f.write(build_statistics_page(page, articles))
+		f.write(build_statistics_page(config, page, articles))
 	print("    Wrote Statistics")
 
 	# Write auxiliary pages
diff --git a/lexipython/resources/lexicon.cfg b/lexipython/resources/lexicon.cfg
index 66d0d35..9d03e53 100644
--- a/lexipython/resources/lexicon.cfg
+++ b/lexipython/resources/lexicon.cfg
@@ -48,6 +48,21 @@ char:WXYZ
 etc:&c.
 <<<INDEX_LIST<<<
 
+# Which statistics to display on the statistics page,
+# and the order in which they appear.
+>>>STATISTICS>>>
+top_pagerank on
+most_citations_made on
+most_citations_to on
+longest_article on
+cumulative_wordcount off
+player_pagerank on
+player_citations_made on
+player_citations_to on
+bottom_pagerank off
+undercited off
+<<<STATISTICS<<<
+
 >>>DEFAULT_SORT>>>
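
The config hunk above and the build_statistics_page hook in build.py cooperate: every "name on" line in the STATISTICS section becomes a lookup of a stat_name method on LexiconStatistics, so enabling, disabling, or reordering statistics never touches build code. A minimal, self-contained sketch of that dispatch pattern (the stub class and config string below are illustrative, not code from this patch):

    # Toy version of the config-driven dispatch in build_statistics_page.
    class StubStatistics:
        def stat_top_pagerank(self):
            return "<div>top pagerank block</div>"
        def stat_undercited(self):
            return "<div>undercited block</div>"

    config_value = "top_pagerank on\nundercited off"
    enabled = []
    for line in config_value.split("\n"):
        stat, toggle = line.split()
        if toggle == "on":
            enabled.append("stat_" + stat)

    stats = StubStatistics()
    blocks = [getattr(stats, name)() for name in enabled if hasattr(stats, name)]
    print("\n".join(blocks))  # Prints only the top_pagerank block.

Because the lookup goes by name through hasattr/getattr, adding a statistic means defining one new stat_ method and adding one line to the config.
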
diff --git a/lexipython/statistics.py b/lexipython/statistics.py
new file mode 100644
index 0000000..85b4ec9
--- /dev/null
+++ b/lexipython/statistics.py
@@ -0,0 +1,317 @@
+# Third party imports
+try:
+	import networkx # For pagerank analytics
+	NETWORKX_ENABLED = True
+except ImportError:
+	NETWORKX_ENABLED = False
+
+# Application imports
+from utils import titlesort
+
+
+def reverse_statistics_dict(stats, reverse=True):
+	"""
+	Transforms a dictionary mapping titles to a value into a list of values
+	and lists of titles. The list is sorted by the value, and the titles are
+	sorted alphabetically.
+	"""
+	rev = {}
+	for key, value in stats.items():
+		if value not in rev:
+			rev[value] = []
+		rev[value].append(key)
+	for key, value in rev.items():
+		rev[key] = sorted(value, key=lambda t: titlesort(t))
+	return sorted(rev.items(), key=lambda x:x[0], reverse=reverse)
+
+
+def itemize(stats_list):
+	"""
+	Formats a list consisting of tuples of ranks and lists of ranked items.
+	"""
+	return map(lambda x: "{0} – {1}".format(x[0], "; ".join(x[1])), stats_list)
+
+
+class LexiconStatistics:
+	"""
+	A wrapper for a persistent statistics context with some precomputed
+	values around for convenience.
+
+	The existence of addendum articles complicates how some statistics are
+	computed. An addendum is an article, with its own author, body, and
+	citations, but in a Lexicon it exists appended to another article. To handle
+	this, we distinguish an _article_ from a _page_. An article is a unit parsed
+	from a single source file. A page is a main article and all addendums under
+	the same title.
+	"""
+
+	def __init__(self, articles):
+		self.articles = articles
+		self.min_turn = 0
+		self.max_turn = 0
+		self.players = set()
+		self.title_to_article = {}
+		self.title_to_page = {}
+		self.stat_block = "<div class=\"contentblock\"><u>{0}</u><br>{1}</div>\n"
+		# Pagerank may not be computable if networkx isn't installed.
+		self.title_to_pagerank = None
+
+		for main_article in articles:
+			page_title = main_article.title
+			self.title_to_page[page_title] = [main_article]
+			self.title_to_page[page_title].extend(main_article.addendums)
+			for article in self.title_to_page[page_title]:
+				# Disambiguate articles by appending turn number to the title
+				key = "{0.title} (T{0.turn})".format(article)
+				self.title_to_article[key] = article
+				if article.player is not None:
+					# Phantoms have turn MAXINT by convention
+					self.min_turn = min(self.min_turn, article.turn)
+					self.max_turn = max(self.max_turn, article.turn)
+					self.players.add(article.player)
+
+	def _try_populate_pagerank(self):
+		"""Computes pagerank if networkx is imported."""
+		if NETWORKX_ENABLED and self.title_to_pagerank is None:
+			# Create a citation graph linking page titles.
+			G = networkx.Graph()
+			for page_title, articles in self.title_to_page.items():
+				for article in articles:
+					for citation in article.citations:
+						G.add_edge(page_title, citation.target)
+
+			# Compute pagerank on the page citation graph.
+			self.title_to_pagerank = networkx.pagerank(G)
+			# Any page with no links in the citation graph has no pagerank.
+			# Assign these pagerank 0 to avoid key errors or missing pages in
+			# the stats.
+			for page_title, articles in self.title_to_page.items():
+				if page_title not in self.title_to_pagerank:
+					self.title_to_pagerank[page_title] = 0
+
+	def stat_top_pagerank(self):
+		"""Computes the top 10 pages by pagerank."""
+		self._try_populate_pagerank()
+
+		if not self.title_to_pagerank:
+			# If networkx was not successfully imported, skip the pagerank.
+			top_ranked_items = ["networkx must be installed to compute pageranks."]
+
+		else:
+			# Get the top ten articles by pagerank.
+			top_pageranks = reverse_statistics_dict(self.title_to_pagerank)[:10]
+			# Replace the pageranks with ordinals.
+			top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1)
+			# Format the ranks into strings.
+			top_ranked_items = itemize(top_ranked)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Top 10 articles by page rank:",
+			"<br>".join(top_ranked_items))
+
+	def stat_most_citations_made(self):
+		"""Computes the top 3 ranks for citations made FROM a page."""
+		# Determine which pages are cited from all articles on a page.
+		pages_cited = {
+			page_title: set()
+			for page_title in self.title_to_page.keys()}
+		for page_title, articles in self.title_to_page.items():
+			for article in articles:
+				for citation in article.citations:
+					pages_cited[page_title].add(citation.target)
+		# Compute the number of unique pages cited by each page.
+		for page_title, cite_titles in pages_cited.items():
+			pages_cited[page_title] = len(cite_titles)
+
+		# Reverse and itemize the citation counts.
+		top_citations = reverse_statistics_dict(pages_cited)[:3]
+		top_citations_items = itemize(top_citations)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Cited the most pages:",
+			"<br>".join(top_citations_items))
+
+	def stat_most_citations_to(self):
+		"""Computes the top 3 ranks for citations made TO a page."""
+		# Determine which pages cite a page.
+		pages_cited_by = {
+			page_title: set()
+			for page_title in self.title_to_page.keys()}
+		for page_title, articles in self.title_to_page.items():
+			for article in articles:
+				for citation in article.citations:
+					pages_cited_by[citation.target].add(page_title)
+		# Compute the number of unique pages that cite each page.
+		for page_title, cite_titles in pages_cited_by.items():
+			pages_cited_by[page_title] = len(cite_titles)
+
+		# Reverse and itemize the citation counts.
+		top_cited = reverse_statistics_dict(pages_cited_by)[:3]
+		top_cited_items = itemize(top_cited)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Cited by the most pages:",
+			"<br>".join(top_cited_items))
+
+	def stat_longest_article(self):
+		"""Computes the top 3 longest articles."""
+		# Compute the length of each article (not page).
+		title_to_article_length = {}
+		for article_title, article in self.title_to_article.items():
+			# Substitute citation aliases with their display text so the
+			# word count reflects the article as written.
+			format_map = {
+				"c"+str(c.id): c.text
+				for c in article.citations
+			}
+			plain_content = article.content.format(**format_map)
+			word_count = len(plain_content.split())
+			title_to_article_length[article_title] = word_count
+
+		# Reverse and itemize the article lengths.
+		top_length = reverse_statistics_dict(title_to_article_length)[:3]
+		top_length_items = itemize(top_length)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Longest articles:",
+			"<br>".join(top_length_items))
+
+	def stat_cumulative_wordcount(self):
+		"""Computes the cumulative word count of the lexicon."""
+		# Initialize all extant turns to 0.
+		turn_to_cumulative_wordcount = {
+			turn_num: 0
+			for turn_num in range(self.min_turn, self.max_turn + 1)
+		}
+		for article_title, article in self.title_to_article.items():
+			# Compute each article's word count.
+			format_map = {
+				"c"+str(c.id): c.text
+				for c in article.citations
+			}
+			plain_content = article.content.format(**format_map)
+			word_count = len(plain_content.split())
+			# Add the word count to each turn the article exists in.
+			for turn_num in range(self.min_turn, self.max_turn + 1):
+				if article.turn <= turn_num:
+					turn_to_cumulative_wordcount[turn_num] += word_count
+
+		# Format the statistics block.
+		len_list = [(str(k), [str(v)]) for k,v in turn_to_cumulative_wordcount.items()]
+		return self.stat_block.format(
+			"Aggregate word count by turn:",
+			"<br>".join(itemize(len_list)))
+
+	def stat_player_pagerank(self):
+		"""Computes each player's share of the lexicon's pagerank scores."""
+		self._try_populate_pagerank()
+
+		if not self.title_to_pagerank:
+			# If networkx was not successfully imported, skip the pagerank.
+			player_rank_items = ["networkx must be installed to compute pageranks."]
+
+		else:
+			player_to_pagerank = {
+				player: 0
+				for player in self.players}
+			# Accumulate page pagerank to the main article's author.
+			for page_title, articles in self.title_to_page.items():
+				page_author = articles[0].player
+				if page_author is not None:
+					player_to_pagerank[page_author] += self.title_to_pagerank[page_title]
+			# Round pageranks off to 3 decimal places.
+			for player, pagerank in player_to_pagerank.items():
+				player_to_pagerank[player] = round(pagerank, 3)
+
+			# Reverse and itemize the aggregated pageranks.
+			player_rank = reverse_statistics_dict(player_to_pagerank)
+			player_rank_items = itemize(player_rank)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Player aggregate page rank:",
+			"<br>".join(player_rank_items))
+
+	def stat_player_citations_made(self):
+		"""Computes the total number of citations made BY each player."""
+		pages_cited_by_player = {
+			player: 0
+			for player in self.players}
+		# Add the number of citations from each authored article (not page).
+		for article_title, article in self.title_to_article.items():
+			if article.player is not None:
+				pages_cited_by_player[article.player] += len(article.citations)
+
+		# Reverse and itemize the counts.
+		player_cites_made_ranks = reverse_statistics_dict(pages_cited_by_player)
+		player_cites_made_items = itemize(player_cites_made_ranks)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Citations made by player:",
+			"<br>".join(player_cites_made_items))
+
+	def stat_player_citations_to(self):
+		"""Computes the total number of citations made TO each player's
+		authored pages."""
+		pages_cited_by_by_player = {
+			player: 0
+			for player in self.players}
+		# Add the number of citations made to each page (not article).
+		for page_title, articles in self.title_to_page.items():
+			page_author = articles[0].player
+			if page_author is not None:
+				pages_cited_by_by_player[page_author] += len(articles[0].citedby)
+
+		# Reverse and itemize the results.
+		cited_times_ranked = reverse_statistics_dict(pages_cited_by_by_player)
+		cited_times_items = itemize(cited_times_ranked)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Citations made to article by player:",
+			"<br>".join(cited_times_items))
+
+	def stat_bottom_pagerank(self):
+		"""Computes the bottom 10 pages by pagerank."""
+		self._try_populate_pagerank()
+
+		if not self.title_to_pagerank:
+			# If networkx was not successfully imported, skip the pagerank.
+			bot_ranked_items = ["networkx must be installed to compute pageranks."]
+
+		else:
+			# Phantoms have no pagerank, because they don't cite anything.
+			exclude = [
+				a.title
+				for a in self.articles
+				if a.player is None]
+			rank_by_written_only = {
+				k:v
+				for k,v in self.title_to_pagerank.items()
+				if k not in exclude}
+
+			# Reverse, enumerate, and itemize the bottom 10 by pagerank.
+			pageranks = reverse_statistics_dict(rank_by_written_only)
+			bot_ranked = list(enumerate(map(lambda x: x[1], pageranks), start=1))[-10:]
+			bot_ranked_items = itemize(bot_ranked)
+
+		# Format the statistics block.
+		return self.stat_block.format(
+			"Bottom 10 articles by page rank:",
+			"<br>".join(bot_ranked_items))
+
+	def stat_undercited(self):
+		"""Computes which pages have 0 or 1 citations made to them."""
+		undercited = {
+			page_title: len(articles[0].citedby)
+			for page_title, articles in self.title_to_page.items()
+			if len(articles[0].citedby) < 2}
+		undercited_items = itemize(reverse_statistics_dict(undercited))
+		return self.stat_block.format(
+			"Undercited articles:",
+			"<br>".join(undercited_items))