Refactor statistics computations

This commit is contained in:
Tim Van Baak 2018-10-24 12:40:54 -07:00
parent 55f5964867
commit ada2317435
1 changed files with 88 additions and 68 deletions

View File

@ -128,6 +128,24 @@ def build_session_page(config):
content=config["SESSION_PAGE"], content=config["SESSION_PAGE"],
citeblock="") citeblock="")
def reverse_statistics_dict(stats, reverse=True):
"""
Transforms a dictionary mapping titles to a value into a list of values
and lists of titles. The list is sorted by the value, and the titles are
sorted alphabetically.
"""
rev = {}
for key, value in stats.items():
if value not in rev:
rev[value] = []
rev[value].append(key)
for key, value in rev.items():
rev[key] = sorted(value, key=lambda t: utils.titlesort(t))
return sorted(rev.items(), key=lambda x:x[0], reverse=reverse)
def itemize(stats_list):
return map(lambda x: "{0} – {1}".format(x[0], "; ".join(x[1])), stats_list)
def build_statistics_page(articles, config): def build_statistics_page(articles, config):
""" """
Builds the full HTML of the statistics page. Builds the full HTML of the statistics page.
@ -136,61 +154,58 @@ def build_statistics_page(articles, config):
cite_map = { cite_map = {
article.title : [ article.title : [
cite_tuple[1] cite_tuple[1]
for cite_tuple in article.citations.values()] for cite_tuple
in article.citations.values()
]
for article in articles} for article in articles}
# Pages by pagerank # Top pages by pagerank
content += "<div class=\"moveable\">\n" # Compute pagerank for each article
content += "<p><u>Top 10 pages by page rank:</u><br>\n"
G = networkx.Graph() G = networkx.Graph()
for citer, citeds in cite_map.items(): for citer, citeds in cite_map.items():
for cited in citeds: for cited in citeds:
G.add_edge(citer, cited) G.add_edge(citer, cited)
ranks = networkx.pagerank(G) rank_by_article = networkx.pagerank(G)
sranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True) # Get the top ten articles by pagerank
ranking = list(enumerate(map(lambda x: x[0], sranks))) top_pageranks = reverse_statistics_dict(rank_by_article)[:10]
content += "<br>\n".join(map(lambda x: "{0} &ndash; {1}".format(x[0]+1, x[1]), ranking[:10])) # Replace the pageranks with ordinals
content += "</p>\n" top_ranked = enumerate(map(lambda x: x[1], top_pageranks), start=1)
content += "</div>\n" # Format the ranks into strings
top_ranked_items = itemize(top_ranked)
# Write the statistics to the page
content += "<div class=\"moveable\">\n"
content += "<p><u>Top 10 pages by page rank:</u><br>\n"
content += "<br>\n".join(top_ranked_items)
content += "</p>\n</div>\n"
# Top number of citations made # Top number of citations made
citations_made = { title : len(cites) for title, cites in cite_map.items() }
top_citations = reverse_statistics_dict(citations_made)[:3]
top_citations_items = itemize(top_citations)
content += "<div class=\"moveable\">\n" content += "<div class=\"moveable\">\n"
content += "<p><u>Most citations made from:</u><br>\n" content += "<p><u>Most citations made from:</u><br>\n"
citation_tally = [(kv[0], len(kv[1])) for kv in cite_map.items()] content += "<br>\n".join(top_citations_items)
citation_count = defaultdict(list) content += "</p>\n</div>\n"
for title, count in citation_tally: citation_count[count].append(title)
content += "<br>\n".join(map(
lambda kv: "{0} &ndash; {1}".format(
kv[0],
"; ".join(sorted(
kv[1],
key=lambda t: utils.titlesort(t)))),
sorted(citation_count.items(), reverse=True)[:3]))
content += "</p>\n"
content += "</div>\n"
# Top number of times cited # Top number of times cited
content += "<div class=\"moveable\">\n" # Build a map of what cites each article
content += "<p><u>Most citations made to:</u><br>\n" all_cited = set([title for citeds in cite_map.values() for title in citeds])
all_cited = set([title for cites in cite_map.values() for title in cites])
cited_by_map = { cited_by_map = {
cited: [ cited: [
citer citer
for citer in cite_map.keys() for citer in cite_map.keys()
if cited in cite_map[citer]] if cited in cite_map[citer]]
for cited in all_cited } for cited in all_cited }
cited_tally = [(kv[0], len(kv[1])) for kv in cited_by_map.items()] # Compute the number of citations to each article
cited_count = defaultdict(list) citations_to = { title : len(cites) for title, cites in cited_by_map.items() }
for title, count in cited_tally: cited_count[count].append(title) top_cited = reverse_statistics_dict(citations_to)[:3]
content += "<br>\n".join(map( top_cited_items = itemize(top_cited)
lambda kv: "{0} &ndash; {1}".format(kv[0], "; ".join(sorted(kv[1]))), content += "<div class=\"moveable\">\n"
sorted(cited_count.items(), reverse=True)[:3])) content += "<p><u>Most citations made to:</u><br>\n"
content += "</p>\n" content += "<br>\n".join(top_cited_items)
content += "</div>\n" content += "</p>\n</div>\n"
# Top article length, roughly by words # Top article length, roughly by words
content += "<div class=\"moveable\">\n"
content += "<p><u>Longest article:</u><br>\n"
article_length = {} article_length = {}
for article in articles: for article in articles:
format_map = { format_map = {
@ -198,61 +213,66 @@ def build_statistics_page(articles, config):
for format_id, cite_tuple in article.citations.items() for format_id, cite_tuple in article.citations.items()
} }
plain_content = article.content.format(**format_map) plain_content = article.content.format(**format_map)
words = len(plain_content.split()) wordcount = len(plain_content.split())
article_length[article.title] = words article_length[article.title] = wordcount
content += "<br>\n".join(map( top_length = reverse_statistics_dict(article_length)[:3]
lambda kv: "{0} &ndash; {1}".format(kv[1], kv[0]), top_length_items = itemize(top_length)
sorted(article_length.items(), reverse=True, key=lambda t: t[1])[:3])) content += "<div class=\"moveable\">\n"
content += "</p>\n" content += "<p><u>Longest article:</u><br>\n"
content += "</div>\n" content += "<br>\n".join(top_length_items)
content += "</p>\n</div>\n"
# Total word count
content += "<div class=\"moveable\">\n" content += "<div class=\"moveable\">\n"
content += "<p><u>Total word count:</u><br>\n" content += "<p><u>Total word count:</u><br>\n"
content += str(sum(article_length.values())) + "</p>" content += str(sum(article_length.values())) + "</p>"
content += "</p>\n</div>\n"
# Player pageranks # Player pageranks
content += "<div class=\"moveable\">\n"
content += "<p><u>Player total page rank:</u><br>\n"
players = sorted(set([article.player for article in articles if article.player is not None])) players = sorted(set([article.player for article in articles if article.player is not None]))
articles_by = { articles_by_player = {
player : [ player : [
a a
for a in articles for a in articles
if a.player == player] if a.player == player]
for player in players} for player in players}
player_rank = { pagerank_by_player = {
player : sum(map(lambda a: ranks[a.title] if a.title in ranks else 0, articles)) player : round(
for player, articles in articles_by.items()} sum(map(
content += "<br>\n".join(map( lambda a: rank_by_article[a.title] if a.title in rank_by_article else 0,
lambda kv: "{0} &ndash; {1}".format(kv[0], round(kv[1], 3)), articles)),
sorted(player_rank.items(), key=lambda t:t[1], reverse=True))) 3)
content += "</p>\n" for player, articles
content += "</div>\n" in articles_by_player.items()}
player_rank = reverse_statistics_dict(pagerank_by_player)
player_rank_items = itemize(player_rank)
content += "<div class=\"moveable\">\n"
content += "<p><u>Player total page rank:</u><br>\n"
content += "<br>\n".join(player_rank_items)
content += "</p>\n</div>\n"
# Player citations made # Player citations made
content += "<div class=\"moveable\">\n"
content += "<p><u>Citations made by player</u><br>\n"
player_cite_count = { player_cite_count = {
player : sum(map(lambda a:len(a.wcites | a.pcites), articles)) player : sum(map(lambda a:len(a.wcites | a.pcites), articles))
for player, articles in articles_by.items()} for player, articles in articles_by_player.items()}
content += "<br>\n".join(map( player_cites_made_ranks = reverse_statistics_dict(player_cite_count)
lambda kv: "{0} &ndash; {1}".format(kv[0], kv[1]), player_cites_made_items = itemize(player_cites_made_ranks)
sorted(player_cite_count.items(), key=lambda t:t[1], reverse=True))) content += "<div class=\"moveable\">\n"
content += "</p>\n" content += "<p><u>Citations made by player</u><br>\n"
content += "</div>\n" content += "<br>\n".join(player_cites_made_items)
content += "</p>\n</div>\n"
# Player cited count # Player cited count
content += "<div class=\"moveable\">\n"
content += "<p><u>Citations made to player</u><br>\n"
cited_times = {player : 0 for player in players} cited_times = {player : 0 for player in players}
for article in articles: for article in articles:
if article.player is not None: if article.player is not None:
cited_times[article.player] += len(article.citedby) cited_times[article.player] += len(article.citedby)
content += "<br>\n".join(map( cited_times_ranked = reverse_statistics_dict(cited_times)
lambda kv: "{0} &ndash; {1}".format(kv[0], kv[1]), cited_times_items = itemize(cited_times_ranked)
sorted(cited_times.items(), key=lambda t:t[1], reverse=True))) content += "<div class=\"moveable\">\n"
content += "</p>\n" content += "<p><u>Citations made to player</u><br>\n"
content += "</div>\n" content += "<br>\n".join(cited_times_items)
content += "</p>\n</div>\n"
# Fill in the entry skeleton # Fill in the entry skeleton
entry_skeleton = utils.load_resource("entry-page.html") entry_skeleton = utils.load_resource("entry-page.html")