Refactor citations and add addendum articles

This commit is contained in:
Tim Van Baak 2018-10-31 15:22:15 -07:00
parent 00cc4e9cfe
commit 7490cd6f7f
2 changed files with 205 additions and 135 deletions

View File

@ -3,21 +3,50 @@ import sys
import re import re
import src.utils as utils import src.utils as utils
class LexiconCitation:
    """
    Represents information about a single citation in a Lexicon article.

    Members:
        id       int: citation id within the article, corresponding to a
                 "{cN}" format hook
        text     string: alias text linked to the citation target
        target   string: title of the article being cited
        article  LexiconArticle: article cited, None until interlink
    """

    def __init__(self, id, citation_text, citation_target, article=None):
        # NOTE: `id` shadows the builtin, but it is part of the public
        # signature and of the attribute name used by format strings.
        self.id = id
        self.text = citation_text
        self.target = citation_target
        self.article = article

    def __repr__(self):
        return "<LexiconCitation(id={0.id}, text=\"{0.text}\", target=\"{0.target}\")>".format(self)

    def __str__(self):
        return "<[{0.id}]:[[{0.text}|{0.target}]]>".format(self)

    def format(self, format_str):
        """
        Interpolates this citation's members into format_str and returns
        the result. The members are passed to str.format as keyword
        arguments, so hooks such as "{text}" or "{article.title}" resolve
        against this citation.
        """
        return format_str.format(**vars(self))
class LexiconArticle: class LexiconArticle:
""" """
A Lexicon article and its metadata. A Lexicon article and its metadata.
Members: Members defined by __init__:
player string: the player of the article player string: player who wrote the article
turn integer: the turn the article was written for turn integer: turn the article was written for
title string: the article title title string: article title
title_filesafe string: the title, escaped, used for filenames title_filesafe string: title, escaped, used for filenames
content string: the HTML content, with citations replaced by format hooks content string: HTML content, with citations replaced by format hooks
citations dict mapping format hook string to tuple of link alias and link target title citations list of LexiconCitations: citations made by the article
wcites list: titles of written articles cited link_class string: CSS class to interpolate (for styling phantoms)
pcites list: titles of phantom articles cited
citedby list: titles of articles that cite this Members undefined until interlink:
The last three are filled in by populate(). addendums list of LexiconArticles: addendum articles to this article
citedby set of LexiconArticles: articles that cite this article
prev_article LexiconArticle: the previous article in read order
next_article LexiconArticle: the next article in read order
""" """
def __init__(self, player, turn, title, content, citations): def __init__(self, player, turn, title, content, citations):
@ -30,9 +59,17 @@ class LexiconArticle:
self.title_filesafe = utils.titleescape(title) self.title_filesafe = utils.titleescape(title)
self.content = content self.content = content
self.citations = citations self.citations = citations
self.wcites = set() self.link_class = "class=\"phantom\"" if player is None else ""
self.pcites = set() self.addendums = []
self.citedby = set() self.citedby = set()
self.prev_article = None
self.next_article = None
def __repr__(self):
    """Returns a debugging representation with the title, turn, and player."""
    return "<LexiconArticle(title={0.title}, turn={0.turn}, player={0.player})>".format(self)
def __str__(self):
    """Returns a short human-readable summary: quoted title, player, and turn."""
    return "<\"{0.title}\", {0.player} turn {0.turn}>".format(self)
@staticmethod @staticmethod
def from_file_raw(raw_content): def from_file_raw(raw_content):
@ -68,7 +105,7 @@ class LexiconArticle:
# Parse the content and extract citations # Parse the content and extract citations
paras = re.split("\n\n+", content_raw.strip()) paras = re.split("\n\n+", content_raw.strip())
content = "" content = ""
citations = {} citations = []
format_id = 1 format_id = 1
if not paras: if not paras:
print("No content") print("No content")
@ -91,7 +128,8 @@ class LexiconArticle:
cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3) cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
cite_title = utils.titlecase(re.sub(r"\s+", " ", link_match.group(3))) cite_title = utils.titlecase(re.sub(r"\s+", " ", link_match.group(3)))
# Record the citation # Record the citation
citations["c"+str(format_id)] = (cite_text, cite_title) cite = LexiconCitation(format_id, cite_text, cite_title)
citations.append(cite)
# Stitch the format id in place of the citation # Stitch the format id in place of the citation
para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):] para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):]
format_id += 1 # Increment to the next format citation format_id += 1 # Increment to the next format citation
@ -129,83 +167,125 @@ class LexiconArticle:
return articles return articles
@staticmethod @staticmethod
def interlink(lexicon_articles):
    """
    Fills out fields on articles that require other articles for context.
    Creates phantom articles.

    Takes the list of written LexiconArticles. For each title, the first
    article in (turn, title) order is the main article; any later article
    with the same title is appended to the main article's addendums.
    Every cited title with no written article gets a phantom article
    (player None, turn sys.maxsize).

    Mutates the articles in place: addendums, citedby, prev_article and
    next_article are filled in, and every citation gets its .article set.

    Returns the main and phantom articles (addendums are reachable only
    through their parent) sorted by turn, then by utils.titlesort(title).
    """
    # Sort out which articles are addendums and which titles are phantoms
    written_titles = set()
    cited_titles = set()
    article_by_title = {}
    written_articles_ordered = sorted(lexicon_articles, key=lambda a: (a.turn, a.title))
    for written_article in written_articles_ordered:
        if written_article.title not in written_titles:
            # Track main articles by title
            article_by_title[written_article.title] = written_article
            written_titles.add(written_article.title)
        else:
            # Append addendums to their parents
            parent = article_by_title[written_article.title]
            parent.addendums.append(written_article)
        # Collect all cited titles, from main articles and addendums alike,
        # so that every citation target resolves below
        for citation in written_article.citations:
            cited_titles.add(citation.target)
    # Create articles for each phantom title
    for title in cited_titles - written_titles:
        # Citations are a list of LexiconCitations; a phantom makes none
        phantom_article = LexiconArticle(
            None, sys.maxsize, title,
            "<p><i>This entry hasn't been written yet.</i></p>", [])
        article_by_title[title] = phantom_article
    # To interlink the articles, each citation needs to have its .article
    # filled in, and that article needs its citedby updated.
    for parent in article_by_title.values():
        under_title = [parent] + parent.addendums
        for citing_article in under_title:
            for citation in citing_article.citations:
                target_article = article_by_title[citation.target]
                citation.article = target_article
                target_article.citedby.add(citing_article)
    # Sort the articles by turn and title, then fill in prev/next fields
    articles_ordered = sorted(
        article_by_title.values(),
        key=lambda a: (a.turn, utils.titlesort(a.title)))
    if articles_ordered:
        # The ends of the read order have no neighbor on one side
        articles_ordered[0].prev_article = None
        articles_ordered[-1].next_article = None
    for earlier, later in zip(articles_ordered, articles_ordered[1:]):
        earlier.next_article = later
        later.prev_article = earlier
    return articles_ordered
def build_default_content(self):
    """
    Builds the contents of the content div for an article page.

    The result is, in order: the main article body under an <h1> title,
    the main citation block (omitted when empty), and then for each
    addendum its body followed by its citation block (omitted when empty).
    Returns the concatenated HTML string.
    """
    blocks = []
    # Build the main article content block
    main_body = self.build_default_article_body()
    blocks.append("<div class=\"contentblock\"><h1>{}</h1>{}</div>\n".format(
        self.title, main_body))
    # Build the main citation content block
    main_citations = self.build_default_citeblock(
        self.prev_article, self.next_article)
    if main_citations:
        blocks.append("<div class=\"contentblock citeblock\">{}</div>\n".format(
            main_citations))
    # Build any addendum content blocks
    for addendum in self.addendums:
        add_body = addendum.build_default_article_body()
        blocks.append("<div class=\"contentblock\">{}</div>\n".format(add_body))
        # Addendums get no prev/next links, only cites/citedby
        add_citations = addendum.build_default_citeblock(None, None)
        if add_citations:
            # NOTE(review): unlike the main citation block, this div has no
            # "citeblock" class -- confirm whether that is intentional.
            blocks.append("<div class=\"contentblock\">{}</div>\n".format(
                add_citations))
    return "".join(blocks)
def build_default_article_body(self):
    """
    Formats citations into the article text and returns the article body.

    Each "{cN}" format hook in self.content is replaced by an HTML link
    to the cited article, using the citation's alias text as link text.
    Citations must already have their .article filled in, since the link
    reads the target article's link_class and title_filesafe.
    """
    link_template = (
        "<a {article.link_class} "
        "href=\"{article.title_filesafe}.html\">{text}</a>")
    format_map = {
        "c" + str(citation.id): citation.format(link_template)
        for citation in self.citations
    }
    return self.content.format(**format_map)
def build_default_citeblock(self, prev_article, next_article):
    """
    Builds the contents of a citation contentblock. For each defined target,
    links the target page as Previous or Next. Prev/next and cites/citedby
    elements are not included if they have no content.

    prev_article and next_article are the articles to link as Previous
    and Next; either may be None. Returns the HTML string, which is empty
    when there are no prev/next targets, no citations, and no citing
    articles.
    """
    content = ""
    # Prev/next links:
    if next_article is not None or prev_article is not None:
        prev_link = (
            "<a {0.link_class} href=\"{0.title_filesafe}.html\">&#8592; Previous</a>".format(
                prev_article)
            if prev_article is not None else "")
        next_link = (
            "<a {0.link_class} href=\"{0.title_filesafe}.html\">Next &#8594;</a>".format(
                next_article)
            if next_article is not None else "")
        # Close the row before the table so the markup nests correctly
        content += "<table><tr>\n<td>{}</td>\n<td>{}</td>\n</tr></table>\n".format(
            prev_link, next_link)
    # Citations: one link per distinct target, in title order
    cites_titles = set()
    cites_links = []
    for citation in sorted(self.citations, key=lambda c: (utils.titlesort(c.target), c.id)):
        if citation.target not in cites_titles:
            cites_titles.add(citation.target)
            cites_links.append(
                citation.format(
                    "<a {article.link_class} href=\"{article.title_filesafe}.html\">{article.title}</a>"))
    cites_str = " / ".join(cites_links)
    if cites_str:
        content += "<p>Citations: {}</p>\n".format(cites_str)
    # Citedby: one link per distinct citing title, so an article and its
    # addendums (which share a title) are listed once
    citedby_titles = set()
    citedby_links = []
    for article in sorted(self.citedby, key=lambda a: (utils.titlesort(a.title), a.turn)):
        if article.title not in citedby_titles:
            citedby_titles.add(article.title)
            citedby_links.append(
                "<a {0.link_class} href=\"{0.title_filesafe}.html\">{0.title}</a>".format(article))
    citedby_str = " / ".join(citedby_links)
    if citedby_str:
        content += "<p>Cited by: {}</p>\n".format(citedby_str)
    return content

View File

@ -133,20 +133,13 @@ def build_statistics_page(page, articles):
Builds the full HTML of the statistics page. Builds the full HTML of the statistics page.
""" """
content = "" content = ""
cite_map = {
article.title : [
cite_tuple[1]
for cite_tuple
in article.citations.values()
]
for article in articles}
# Top pages by pagerank # Top pages by pagerank
# Compute pagerank for each article # Compute pagerank for each article
G = networkx.Graph() G = networkx.Graph()
for citer, citeds in cite_map.items(): for article in articles:
for cited in citeds: for citation in article.citations:
G.add_edge(citer, cited) G.add_edge(article.title, citation.target)
rank_by_article = networkx.pagerank(G) rank_by_article = networkx.pagerank(G)
# Get the top ten articles by pagerank # Get the top ten articles by pagerank
top_pageranks = reverse_statistics_dict(rank_by_article)[:10] top_pageranks = reverse_statistics_dict(rank_by_article)[:10]
@ -156,34 +149,25 @@ def build_statistics_page(page, articles):
top_ranked_items = itemize(top_ranked) top_ranked_items = itemize(top_ranked)
# Write the statistics to the page # Write the statistics to the page
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Top 10 pages by page rank:</u><br>\n" content += "<u>Top 10 articles by page rank:</u><br>\n"
content += "<br>\n".join(top_ranked_items) content += "<br>\n".join(top_ranked_items)
content += "</div>\n" content += "</div>\n"
# Top number of citations made # Top number of citations made
citations_made = { title : len(cites) for title, cites in cite_map.items() } citations_made = {article.title : len(article.citations) for article in articles}
top_citations = reverse_statistics_dict(citations_made)[:3] top_citations = reverse_statistics_dict(citations_made)[:3]
top_citations_items = itemize(top_citations) top_citations_items = itemize(top_citations)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Most citations made from:</u><br>\n" content += "<u>Top articles by citations made:</u><br>\n"
content += "<br>\n".join(top_citations_items) content += "<br>\n".join(top_citations_items)
content += "</div>\n" content += "</div>\n"
# Top number of times cited # Top number of times cited
# Build a map of what cites each article citations_to = {article.title : len(article.citedby) for article in articles}
all_cited = set([title for citeds in cite_map.values() for title in citeds])
cited_by_map = {
cited: [
citer
for citer in cite_map.keys()
if cited in cite_map[citer]]
for cited in all_cited }
# Compute the number of citations to each article
citations_to = { title : len(cites) for title, cites in cited_by_map.items() }
top_cited = reverse_statistics_dict(citations_to)[:3] top_cited = reverse_statistics_dict(citations_to)[:3]
top_cited_items = itemize(top_cited) top_cited_items = itemize(top_cited)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Most citations made to:</u><br>\n" content += "<u>Most cited articles:</u><br>\n"
content += "<br>\n".join(top_cited_items) content += "<br>\n".join(top_cited_items)
content += "</div>\n" content += "</div>\n"
@ -191,16 +175,15 @@ def build_statistics_page(page, articles):
article_length = {} article_length = {}
for article in articles: for article in articles:
format_map = { format_map = {
format_id: cite_tuple[0] "c"+str(c.id): c.text
for format_id, cite_tuple in article.citations.items() for c in article.citations
} }
plain_content = article.content.format(**format_map) plain_content = article.content.format(**format_map)
wordcount = len(plain_content.split()) article_length[article.title] = len(plain_content.split())
article_length[article.title] = wordcount
top_length = reverse_statistics_dict(article_length)[:3] top_length = reverse_statistics_dict(article_length)[:3]
top_length_items = itemize(top_length) top_length_items = itemize(top_length)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Longest article:</u><br>\n" content += "<u>Longest articles:</u><br>\n"
content += "<br>\n".join(top_length_items) content += "<br>\n".join(top_length_items)
content += "</div>\n" content += "</div>\n"
@ -211,21 +194,23 @@ def build_statistics_page(page, articles):
content += "</div>\n" content += "</div>\n"
# Player pageranks # Player pageranks
# Add addendums and recompute pagerank
for article in articles:
for addendum in article.addendums:
for citation in addendum.citations:
addendum_title = "{0.title}-T{0.turn}".format(addendum)
G.add_edge(addendum_title, citation.target)
rank_by_article = networkx.pagerank(G)
players = sorted(set([article.player for article in articles if article.player is not None])) players = sorted(set([article.player for article in articles if article.player is not None]))
articles_by_player = { pagerank_by_player = {player: 0 for player in players}
player : [ for article in articles:
a if article.player is not None:
for a in articles pagerank_by_player[article.player] += rank_by_article[article.title]
if a.player == player] for addendum in article.addendums:
for player in players} addendum_title = "{0.title}-T{0.turn}".format(addendum)
pagerank_by_player = { pagerank_by_player[addendum_title] += rank_by_article[addendum_title]
player : round( for player in players:
sum(map( pagerank_by_player[player] = round(pagerank_by_player[player], 3)
lambda a: rank_by_article[a.title] if a.title in rank_by_article else 0,
articles)),
3)
for player, articles
in articles_by_player.items()}
player_rank = reverse_statistics_dict(pagerank_by_player) player_rank = reverse_statistics_dict(pagerank_by_player)
player_rank_items = itemize(player_rank) player_rank_items = itemize(player_rank)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
@ -234,13 +219,17 @@ def build_statistics_page(page, articles):
content += "</div>\n" content += "</div>\n"
# Player citations made # Player citations made
player_cite_count = { cite_count_by_player = {player: 0 for player in players}
player : sum(map(lambda a:len(a.wcites | a.pcites), articles)) for article in articles:
for player, articles in articles_by_player.items()} if article.player is not None:
player_cites_made_ranks = reverse_statistics_dict(player_cite_count) unique_citations = set([a.target for a in article.citations])
cite_count_by_player[article.player] += len(unique_citations)
for addendum in article.addendums:
cite_count_by_player[addendum.player] += len(addendum.citations)
player_cites_made_ranks = reverse_statistics_dict(cite_count_by_player)
player_cites_made_items = itemize(player_cites_made_ranks) player_cites_made_items = itemize(player_cites_made_ranks)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Citations made by player</u><br>\n" content += "<u>Citations made by player:</u><br>\n"
content += "<br>\n".join(player_cites_made_items) content += "<br>\n".join(player_cites_made_items)
content += "</div>\n" content += "</div>\n"
@ -252,7 +241,7 @@ def build_statistics_page(page, articles):
cited_times_ranked = reverse_statistics_dict(cited_times) cited_times_ranked = reverse_statistics_dict(cited_times)
cited_times_items = itemize(cited_times_ranked) cited_times_items = itemize(cited_times_ranked)
content += "<div class=\"contentblock\">\n" content += "<div class=\"contentblock\">\n"
content += "<u>Citations made to player</u><br>\n" content += "<u>Citations made to player:</u><br>\n"
content += "<br>\n".join(cited_times_items) content += "<br>\n".join(cited_times_items)
content += "</div>\n" content += "</div>\n"
@ -350,7 +339,7 @@ def build_all(path_prefix, lexicon_name):
# Once they've been populated, the articles list has the titles of all articles # Once they've been populated, the articles list has the titles of all articles
# Sort this by turn before title so prev/next links run in turn order # Sort this by turn before title so prev/next links run in turn order
articles = sorted( articles = sorted(
LexiconArticle.populate(articles), LexiconArticle.interlink(articles),
key=lambda a: (a.turn, utils.titlesort(a.title))) key=lambda a: (a.turn, utils.titlesort(a.title)))
def pathto(*els): def pathto(*els):
@ -372,13 +361,14 @@ def build_all(path_prefix, lexicon_name):
for idx in range(l): for idx in range(l):
article = articles[idx] article = articles[idx]
with open(pathto("article", article.title_filesafe + ".html"), "w", encoding="utf-8") as f: with open(pathto("article", article.title_filesafe + ".html"), "w", encoding="utf-8") as f:
contentblock = article.build_default_contentblock() content = article.build_default_content()
citeblock = article.build_default_citeblock( #contentblock = article.build_default_contentblock()
None if idx == 0 else articles[idx - 1], #citeblock = article.build_default_citeblock(
None if idx == l-1 else articles[idx + 1]) # None if idx == 0 else articles[idx - 1],
# None if idx == l-1 else articles[idx + 1])
article_html = page.format( article_html = page.format(
title = article.title, title = article.title,
content = contentblock + citeblock) content = content)
f.write(article_html) f.write(article_html)
print(" Wrote " + article.title) print(" Wrote " + article.title)
@ -409,10 +399,10 @@ def build_all(path_prefix, lexicon_name):
# Check that authors aren't citing themselves # Check that authors aren't citing themselves
print("Running citation checks...") print("Running citation checks...")
article_by_title = {article.title : article for article in articles} article_by_title = {article.title : article for article in articles}
for article in articles: #for article in articles:
for _, tup in article.citations.items(): # for _, tup in article.citations.items():
cited = article_by_title[tup[1]] # cited = article_by_title[tup[1]]
if article.player == cited.player: # if article.player == cited.player:
print(" {2}: {0} cites {1}".format(article.title, cited.title, cited.player)) # print(" {2}: {0} cites {1}".format(article.title, cited.title, cited.player))
print() print()