Rewrite statistics calculations again

This commit is contained in:
Tim Van Baak 2018-12-07 08:58:41 -08:00
parent c9281b6450
commit c50676c37d
1 changed file with 104 additions and 91 deletions

View File

@ -131,120 +131,129 @@ def itemize(stats_list):
def build_statistics_page(page, articles): def build_statistics_page(page, articles):
""" """
Builds the full HTML of the statistics page. Builds the full HTML of the statistics page.
The existence of addendum articles complicates how some statistics are
computed. An addendum is an article, with its own author, body, and
citations, but in a Lexicon it exists appended to another article. To handle
this, we distinguish an _article_ from a _page_. An article is a unit parsed
from a single source file. A page is a main article and all addendums under
the same title.
""" """
min_turn = 0
max_turn = 0
article_by_title = {}
page_by_title = {}
players = set()
for main_article in articles:
key = main_article.title
page_by_title[key] = [main_article]
page_by_title[key].extend(main_article.addendums)
for article in [main_article] + main_article.addendums:
# Disambiguate articles by appending turn number to the title
key = "{0.title} (T{0.turn})".format(article)
article_by_title[key] = article
if article.player is not None:
min_turn = min(min_turn, article.turn)
max_turn = max(max_turn, article.turn)
players.add(article.player)
content = "" content = ""
stat_block = "<div class=\"contentblock\"><u>{0}</u><br>{1}</div>\n"
# Top pages by pagerank # Top pages by pagerank
# Compute pagerank for each article # Compute pagerank for each page, including all articles
G = networkx.Graph() G = networkx.Graph()
for article in articles: for page_title, articles in page_by_title.items():
for citation in article.citations: for article in articles:
G.add_edge(article.title, citation.target) for citation in article.citations:
rank_by_article = networkx.pagerank(G) G.add_edge(page_title, citation.target)
pagerank_by_title = networkx.pagerank(G)
for page_title, articles in page_by_title.items():
if page_title not in pagerank_by_title:
pagerank_by_title[page_title] = 0
# Get the top ten articles by pagerank # Get the top ten articles by pagerank
top_pageranks = reverse_statistics_dict(rank_by_article)[:10] top_pageranks = reverse_statistics_dict(pagerank_by_title)[:10]
# Replace the pageranks with ordinals # Replace the pageranks with ordinals
top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1) top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1)
# Format the ranks into strings # Format the ranks into strings
top_ranked_items = itemize(top_ranked) top_ranked_items = itemize(top_ranked)
# Write the statistics to the page # Write the statistics to the page
content += "<div class=\"contentblock\">\n" content += stat_block.format(
content += "<u>Top 10 articles by page rank:</u><br>\n" "Top 10 articles by page rank:",
content += "<br>\n".join(top_ranked_items) "<br>".join(top_ranked_items))
content += "</div>\n"
# Top number of citations made # Pages cited/cited by
citations_made = {article.title : len(article.citations) for article in articles} pages_cited = {page_title: set() for page_title in page_by_title.keys()}
top_citations = reverse_statistics_dict(citations_made)[:3] pages_cited_by = {page_title: set() for page_title in page_by_title.keys()}
for page_title, articles in page_by_title.items():
for article in articles:
for citation in article.citations:
pages_cited[page_title].add(citation.target)
pages_cited_by[citation.target].add(page_title)
for page_title, cite_titles in pages_cited.items():
pages_cited[page_title] = len(cite_titles)
for page_title, cite_titles in pages_cited_by.items():
pages_cited_by[page_title] = len(cite_titles)
top_citations = reverse_statistics_dict(pages_cited)[:3]
top_citations_items = itemize(top_citations) top_citations_items = itemize(top_citations)
content += "<div class=\"contentblock\">\n" content += stat_block.format(
content += "<u>Top articles by citations made:</u><br>\n" "Cited the most pages:",
content += "<br>\n".join(top_citations_items) "<br>".join(top_citations_items))
content += "</div>\n" top_cited = reverse_statistics_dict(pages_cited_by)[:3]
# Top number of times cited
citations_to = {article.title : len(article.citedby) for article in articles}
top_cited = reverse_statistics_dict(citations_to)[:3]
top_cited_items = itemize(top_cited) top_cited_items = itemize(top_cited)
content += "<div class=\"contentblock\">\n" content += stat_block.format(
content += "<u>Most cited articles:</u><br>\n" "Cited by the most pages:",
content += "<br>\n".join(top_cited_items) "<br>".join(top_cited_items))
content += "</div>\n"
# Top article length, roughly by words # Top article length
article_length = {} article_length_by_title = {}
for article in articles: cumulative_article_length_by_turn = {
turn_num: 0
for turn_num in range(min_turn, max_turn + 1)
}
for article_title, article in article_by_title.items():
format_map = { format_map = {
"c"+str(c.id): c.text "c"+str(c.id): c.text
for c in article.citations for c in article.citations
} }
plain_content = article.content.format(**format_map) plain_content = article.content.format(**format_map)
article_length[article.title] = len(plain_content.split()) word_count = len(plain_content.split())
top_length = reverse_statistics_dict(article_length)[:3] article_length_by_title[article_title] = word_count
for turn_num in range(min_turn, max_turn + 1):
if article.turn <= turn_num:
cumulative_article_length_by_turn[turn_num] += word_count
top_length = reverse_statistics_dict(article_length_by_title)[:3]
top_length_items = itemize(top_length) top_length_items = itemize(top_length)
content += "<div class=\"contentblock\">\n" content += stat_block.format(
content += "<u>Longest articles:</u><br>\n" "Longest articles:",
content += "<br>\n".join(top_length_items) "<br>".join(top_length_items))
content += "</div>\n"
# Total word count # Total word count
all_articles = [] len_list = [(str(k), [str(v)]) for k,v in cumulative_article_length_by_turn.items()]
for article in articles: content += stat_block.format(
all_articles.append(article) "Aggregate word count by turn:",
all_articles.extend(article.addendums) "<br>".join(itemize(len_list)))
turn_numbers = set([a.turn for a in articles if a.player is not None])
aggregate = {num: 0 for num in turn_numbers}
for turn_num in turn_numbers:
for article in all_articles:
if article.turn <= turn_num:
format_map = {
"c"+str(c.id): c.text
for c in article.citations
}
plain_content = article.content.format(**format_map)
aggregate[turn_num] += len(plain_content.split())
aggr_list = [(str(k), [str(v)]) for k,v in aggregate.items()]
content += "<div class=\"contentblock\">\n"
content += "<u>Aggregate word count by turn:</u><br>\n"
content += "<br>\n".join(itemize(aggr_list))
content += "</div>\n"
# Player pageranks # Player pageranks
# Add addendums and recompute pagerank
for article in articles:
for addendum in article.addendums:
for citation in addendum.citations:
addendum_title = "{0.title}-T{0.turn}".format(addendum)
G.add_edge(addendum_title, citation.target)
rank_by_article_all = networkx.pagerank(G)
players = sorted(set([article.player for article in articles if article.player is not None]))
pagerank_by_player = {player: 0 for player in players} pagerank_by_player = {player: 0 for player in players}
for article in articles: for page_title, articles in page_by_title.items():
if article.player is not None: page_author = articles[0].player
pagerank_by_player[article.player] += (rank_by_article_all[article.title] if page_author is not None:
if article.title in rank_by_article_all else 0) pagerank_by_player[page_author] += pagerank_by_title[page_title]
for addendum in article.addendums: for player, pagerank in pagerank_by_player.items():
addendum_title = "{0.title}-T{0.turn}".format(addendum) pagerank_by_player[player] = round(pagerank, 3)
pagerank_by_player[addendum.player] += (rank_by_article_all[addendum_title]
if addendum_title in rank_by_article_all else 0)
for player in players:
pagerank_by_player[player] = round(pagerank_by_player[player], 3)
player_rank = reverse_statistics_dict(pagerank_by_player) player_rank = reverse_statistics_dict(pagerank_by_player)
player_rank_items = itemize(player_rank) player_rank_items = itemize(player_rank)
content += "<div class=\"contentblock\">\n" content += stat_block.format(
content += "<u>Player total page rank:</u><br>\n" "Player aggregate page rank:",
content += "<br>\n".join(player_rank_items) "<br>".join(player_rank_items))
content += "</div>\n"
# Player citations made # Player citations made
cite_count_by_player = {player: 0 for player in players} pages_cited_by_player = {player: 0 for player in players}
for article in articles: for article_title, article in article_by_title.items():
if article.player is not None: if article.player is not None:
unique_citations = set([a.target for a in article.citations]) pages_cited_by_player[article.player] += len(article.citations)
cite_count_by_player[article.player] += len(unique_citations) player_cites_made_ranks = reverse_statistics_dict(pages_cited_by_player)
for addendum in article.addendums:
cite_count_by_player[addendum.player] += len(addendum.citations)
player_cites_made_ranks = reverse_statistics_dict(cite_count_by_player)
player_cites_made_items = itemize(player_cites_made_ranks) player_cites_made_items = itemize(player_cites_made_ranks)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Citations made by player:</u><br>\n" content += "<u>Citations made by player:</u><br>\n"
@ -252,20 +261,21 @@ def build_statistics_page(page, articles):
content += "</div>\n" content += "</div>\n"
# Player cited count # Player cited count
cited_times = {player : 0 for player in players} pages_cited_by_by_player = {player: 0 for player in players}
for article in articles: for page_title, articles in page_by_title.items():
if article.player is not None: page_author = articles[0].player
cited_times[article.player] += len(article.citedby) if page_author is not None:
cited_times_ranked = reverse_statistics_dict(cited_times) pages_cited_by_by_player[page_author] += len(articles[0].citedby)
cited_times_ranked = reverse_statistics_dict(pages_cited_by_by_player)
cited_times_items = itemize(cited_times_ranked) cited_times_items = itemize(cited_times_ranked)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Citations made to player:</u><br>\n" content += "<u>Citations made to article by player:</u><br>\n"
content += "<br>\n".join(cited_times_items) content += "<br>\n".join(cited_times_items)
content += "</div>\n" content += "</div>\n"
# Lowest pagerank of written articles # Lowest pagerank of written articles
exclude = [a.title for a in articles if a.player is None] exclude = [a.title for a in articles if a.player is None]
rank_by_written_only = {k:v for k,v in rank_by_article.items() if k not in exclude} rank_by_written_only = {k:v for k,v in pagerank_by_title.items() if k not in exclude}
pageranks = reverse_statistics_dict(rank_by_written_only) pageranks = reverse_statistics_dict(rank_by_written_only)
bot_ranked = list(enumerate(map(lambda x: x[1], pageranks), start=1))[-10:] bot_ranked = list(enumerate(map(lambda x: x[1], pageranks), start=1))[-10:]
# Format the ranks into strings # Format the ranks into strings
@ -276,7 +286,10 @@ def build_statistics_page(page, articles):
content += "</div>\n" content += "</div>\n"
# Undercited articles # Undercited articles
undercited = {a.title: len(a.citedby) for a in articles if len(a.citedby) <= 1} undercited = {
page_title: len(articles[0].citedby)
for page_title, articles in page_by_title.items()
if len(articles[0].citedby) < 2}
undercited_items = itemize(reverse_statistics_dict(undercited)) undercited_items = itemize(reverse_statistics_dict(undercited))
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Undercited articles:</u><br>\n" content += "<u>Undercited articles:</u><br>\n"