diff --git a/src/build.py b/src/build.py
index a00979b..4530322 100644
--- a/src/build.py
+++ b/src/build.py
@@ -131,120 +131,129 @@ def itemize(stats_list):
def build_statistics_page(page, articles):
"""
Builds the full HTML of the statistics page.
+
+ The existence of addendum articles complicates how some statistics are
+ computed. An addendum is an article, with its own author, body, and
+ citations, but in a Lexicon it exists appended to another article. To handle
+ this, we distinguish an _article_ from a _page_. An article is a unit parsed
+ from a single source file. A page is a main article and all addendums under
+ the same title.
"""
+ min_turn = 0
+ max_turn = 0
+ article_by_title = {}
+ page_by_title = {}
+ players = set()
+ for main_article in articles:
+ key = main_article.title
+ page_by_title[key] = [main_article]
+ page_by_title[key].extend(main_article.addendums)
+ for article in [main_article] + main_article.addendums:
+ # Disambiguate articles by appending turn number to the title
+ key = "{0.title} (T{0.turn})".format(article)
+ article_by_title[key] = article
+ if article.player is not None:
+ min_turn = min(min_turn, article.turn)
+ max_turn = max(max_turn, article.turn)
+ players.add(article.player)
content = ""
+ stat_block = "
{0}
{1}
\n"
# Top pages by pagerank
- # Compute pagerank for each article
+ # Compute pagerank for each page, including all articles
G = networkx.Graph()
- for article in articles:
- for citation in article.citations:
- G.add_edge(article.title, citation.target)
- rank_by_article = networkx.pagerank(G)
+ for page_title, articles in page_by_title.items():
+ for article in articles:
+ for citation in article.citations:
+ G.add_edge(page_title, citation.target)
+ pagerank_by_title = networkx.pagerank(G)
+ for page_title, articles in page_by_title.items():
+ if page_title not in pagerank_by_title:
+ pagerank_by_title[page_title] = 0
# Get the top ten articles by pagerank
- top_pageranks = reverse_statistics_dict(rank_by_article)[:10]
+ top_pageranks = reverse_statistics_dict(pagerank_by_title)[:10]
# Replace the pageranks with ordinals
top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1)
# Format the ranks into strings
top_ranked_items = itemize(top_ranked)
# Write the statistics to the page
- content += "\n"
- content += "Top 10 articles by page rank:
\n"
- content += "
\n".join(top_ranked_items)
- content += "
\n"
+ content += stat_block.format(
+ "Top 10 articles by page rank:",
+ "
".join(top_ranked_items))
- # Top number of citations made
- citations_made = {article.title : len(article.citations) for article in articles}
- top_citations = reverse_statistics_dict(citations_made)[:3]
+ # Pages cited/cited by
+ pages_cited = {page_title: set() for page_title in page_by_title.keys()}
+ pages_cited_by = {page_title: set() for page_title in page_by_title.keys()}
+ for page_title, articles in page_by_title.items():
+ for article in articles:
+ for citation in article.citations:
+ pages_cited[page_title].add(citation.target)
+ pages_cited_by[citation.target].add(page_title)
+ for page_title, cite_titles in pages_cited.items():
+ pages_cited[page_title] = len(cite_titles)
+ for page_title, cite_titles in pages_cited_by.items():
+ pages_cited_by[page_title] = len(cite_titles)
+
+ top_citations = reverse_statistics_dict(pages_cited)[:3]
top_citations_items = itemize(top_citations)
- content += "\n"
- content += "Top articles by citations made:
\n"
- content += "
\n".join(top_citations_items)
- content += "
\n"
-
- # Top number of times cited
- citations_to = {article.title : len(article.citedby) for article in articles}
- top_cited = reverse_statistics_dict(citations_to)[:3]
+ content += stat_block.format(
+ "Cited the most pages:",
+ "
".join(top_citations_items))
+ top_cited = reverse_statistics_dict(pages_cited_by)[:3]
top_cited_items = itemize(top_cited)
- content += "\n"
- content += "Most cited articles:
\n"
- content += "
\n".join(top_cited_items)
- content += "
\n"
+ content += stat_block.format(
+ "Cited by the most pages:",
+ "
".join(top_cited_items))
- # Top article length, roughly by words
- article_length = {}
- for article in articles:
+ # Top article length
+ article_length_by_title = {}
+ cumulative_article_length_by_turn = {
+ turn_num: 0
+ for turn_num in range(min_turn, max_turn + 1)
+ }
+ for article_title, article in article_by_title.items():
format_map = {
"c"+str(c.id): c.text
for c in article.citations
}
plain_content = article.content.format(**format_map)
- article_length[article.title] = len(plain_content.split())
- top_length = reverse_statistics_dict(article_length)[:3]
+ word_count = len(plain_content.split())
+ article_length_by_title[article_title] = word_count
+ for turn_num in range(min_turn, max_turn + 1):
+ if article.turn <= turn_num:
+ cumulative_article_length_by_turn[turn_num] += word_count
+ top_length = reverse_statistics_dict(article_length_by_title)[:3]
top_length_items = itemize(top_length)
- content += "\n"
- content += "Longest articles:
\n"
- content += "
\n".join(top_length_items)
- content += "
\n"
+ content += stat_block.format(
+ "Longest articles:",
+ "
".join(top_length_items))
# Total word count
- all_articles = []
- for article in articles:
- all_articles.append(article)
- all_articles.extend(article.addendums)
- turn_numbers = set([a.turn for a in articles if a.player is not None])
- aggregate = {num: 0 for num in turn_numbers}
- for turn_num in turn_numbers:
- for article in all_articles:
- if article.turn <= turn_num:
- format_map = {
- "c"+str(c.id): c.text
- for c in article.citations
- }
- plain_content = article.content.format(**format_map)
- aggregate[turn_num] += len(plain_content.split())
- aggr_list = [(str(k), [str(v)]) for k,v in aggregate.items()]
- content += "\n"
- content += "Aggregate word count by turn:
\n"
- content += "
\n".join(itemize(aggr_list))
- content += "
\n"
+ len_list = [(str(k), [str(v)]) for k,v in cumulative_article_length_by_turn.items()]
+ content += stat_block.format(
+ "Aggregate word count by turn:",
+ "
".join(itemize(len_list)))
# Player pageranks
- # Add addendums and recompute pagerank
- for article in articles:
- for addendum in article.addendums:
- for citation in addendum.citations:
- addendum_title = "{0.title}-T{0.turn}".format(addendum)
- G.add_edge(addendum_title, citation.target)
- rank_by_article_all = networkx.pagerank(G)
- players = sorted(set([article.player for article in articles if article.player is not None]))
pagerank_by_player = {player: 0 for player in players}
- for article in articles:
- if article.player is not None:
- pagerank_by_player[article.player] += (rank_by_article_all[article.title]
- if article.title in rank_by_article_all else 0)
- for addendum in article.addendums:
- addendum_title = "{0.title}-T{0.turn}".format(addendum)
- pagerank_by_player[addendum.player] += (rank_by_article_all[addendum_title]
- if addendum_title in rank_by_article_all else 0)
- for player in players:
- pagerank_by_player[player] = round(pagerank_by_player[player], 3)
+ for page_title, articles in page_by_title.items():
+ page_author = articles[0].player
+ if page_author is not None:
+ pagerank_by_player[page_author] += pagerank_by_title[page_title]
+ for player, pagerank in pagerank_by_player.items():
+ pagerank_by_player[player] = round(pagerank, 3)
player_rank = reverse_statistics_dict(pagerank_by_player)
player_rank_items = itemize(player_rank)
- content += "\n"
- content += "Player total page rank:
\n"
- content += "
\n".join(player_rank_items)
- content += "
\n"
+ content += stat_block.format(
+ "Player aggregate page rank:",
+ "
".join(player_rank_items))
# Player citations made
- cite_count_by_player = {player: 0 for player in players}
- for article in articles:
+ pages_cited_by_player = {player: 0 for player in players}
+ for article_title, article in article_by_title.items():
if article.player is not None:
- unique_citations = set([a.target for a in article.citations])
- cite_count_by_player[article.player] += len(unique_citations)
- for addendum in article.addendums:
- cite_count_by_player[addendum.player] += len(addendum.citations)
- player_cites_made_ranks = reverse_statistics_dict(cite_count_by_player)
+ pages_cited_by_player[article.player] += len(article.citations)
+ player_cites_made_ranks = reverse_statistics_dict(pages_cited_by_player)
player_cites_made_items = itemize(player_cites_made_ranks)
content += "\n"
content += "Citations made by player:
\n"
@@ -252,20 +261,21 @@ def build_statistics_page(page, articles):
content += "
\n"
# Player cited count
- cited_times = {player : 0 for player in players}
- for article in articles:
- if article.player is not None:
- cited_times[article.player] += len(article.citedby)
- cited_times_ranked = reverse_statistics_dict(cited_times)
+ pages_cited_by_by_player = {player: 0 for player in players}
+ for page_title, articles in page_by_title.items():
+ page_author = articles[0].player
+ if page_author is not None:
+ pages_cited_by_by_player[page_author] += len(articles[0].citedby)
+ cited_times_ranked = reverse_statistics_dict(pages_cited_by_by_player)
cited_times_items = itemize(cited_times_ranked)
content += "\n"
- content += "Citations made to player:
\n"
+ content += "Citations made to article by player:
\n"
content += "
\n".join(cited_times_items)
content += "
\n"
# Lowest pagerank of written articles
exclude = [a.title for a in articles if a.player is None]
- rank_by_written_only = {k:v for k,v in rank_by_article.items() if k not in exclude}
+ rank_by_written_only = {k:v for k,v in pagerank_by_title.items() if k not in exclude}
pageranks = reverse_statistics_dict(rank_by_written_only)
bot_ranked = list(enumerate(map(lambda x: x[1], pageranks), start=1))[-10:]
# Format the ranks into strings
@@ -276,7 +286,10 @@ def build_statistics_page(page, articles):
content += "\n"
# Undercited articles
- undercited = {a.title: len(a.citedby) for a in articles if len(a.citedby) <= 1}
+ undercited = {
+ page_title: len(articles[0].citedby)
+ for page_title, articles in page_by_title.items()
+ if len(articles[0].citedby) < 2}
undercited_items = itemize(reverse_statistics_dict(undercited))
content += "\n"
content += "Undercited articles:
\n"