Refactor citations and add addendum articles
This commit is contained in:
parent
00cc4e9cfe
commit
7490cd6f7f
230
src/article.py
230
src/article.py
|
@ -3,21 +3,50 @@ import sys
|
|||
import re
|
||||
import src.utils as utils
|
||||
|
||||
class LexiconCitation:
    """
    Represents information about a single citation in a Lexicon article.

    Members:
        id      int: citation id within the article, corresponding to a "{cN}"
                format hook
        text    string: alias text linked to the citation target
        target  string: title of the article being cited
        article LexiconArticle: article cited, None until interlink
    """

    # NOTE: the parameter name `id` shadows the builtin, but it is part of the
    # constructor's interface and is kept for callers.
    def __init__(self, id, citation_text, citation_target, article=None):
        self.id = id
        self.text = citation_text
        self.target = citation_target
        self.article = article

    def __repr__(self):
        """Debug representation showing the id, alias text and target title."""
        return "<LexiconCitation(id={0.id}, text=\"{0.text}\", target=\"{0.target}\")>".format(self)

    def __str__(self):
        """Compact representation: <[id]:[[text|target]]>."""
        return "<[{0.id}]:[[{0.text}|{0.target}]]>".format(self)

    def format(self, format_str):
        """
        Interpolates this citation's members into format_str.

        The instance attributes (id, text, target, article) are passed to
        str.format as keyword arguments, so format_str may use replacement
        fields such as "{text}" or "{article.title}". Fields that dereference
        article raise AttributeError while article is still None.
        """
        return format_str.format(**vars(self))
|
||||
|
||||
class LexiconArticle:
|
||||
"""
|
||||
A Lexicon article and its metadata.
|
||||
|
||||
Members:
|
||||
player string: the player of the article
|
||||
turn integer: the turn the article was written for
|
||||
title string: the article title
|
||||
title_filesafe string: the title, escaped, used for filenames
|
||||
content string: the HTML content, with citations replaced by format hooks
|
||||
citations dict mapping format hook string to tuple of link alias and link target title
|
||||
wcites list: titles of written articles cited
|
||||
pcites list: titles of phantom articles cited
|
||||
citedby list: titles of articles that cite this
|
||||
The last three are filled in by populate().
|
||||
Members defined by __init__:
|
||||
player string: player who wrote the article
|
||||
turn integer: turn the article was written for
|
||||
title string: article title
|
||||
title_filesafe string: title, escaped, used for filenames
|
||||
content string: HTML content, with citations replaced by format hooks
|
||||
citations list of LexiconCitations: citations made by the article
|
||||
link_class string: CSS class to interpolate (for styling phantoms)
|
||||
|
||||
Members undefined until interlink:
|
||||
addendums list of LexiconArticles: addendum articles to this article
|
||||
citedby set of LexiconArticles: articles that cite this article
|
||||
prev_article LexiconArticle: the previous article in read order
|
||||
next_article LexiconArticle: the next article in read order
|
||||
"""
|
||||
|
||||
def __init__(self, player, turn, title, content, citations):
|
||||
|
@ -30,9 +59,17 @@ class LexiconArticle:
|
|||
self.title_filesafe = utils.titleescape(title)
|
||||
self.content = content
|
||||
self.citations = citations
|
||||
self.wcites = set()
|
||||
self.pcites = set()
|
||||
self.link_class = "class=\"phantom\"" if player is None else ""
|
||||
self.addendums = []
|
||||
self.citedby = set()
|
||||
self.prev_article = None
|
||||
self.next_article = None
|
||||
|
||||
def __repr__(self):
|
||||
return "<LexiconArticle(title={0.title}, turn={0.turn}, player={0.player})>".format(self)
|
||||
|
||||
def __str__(self):
|
||||
return "<\"{0.title}\", {0.player} turn {0.turn}>".format(self)
|
||||
|
||||
@staticmethod
|
||||
def from_file_raw(raw_content):
|
||||
|
@ -68,7 +105,7 @@ class LexiconArticle:
|
|||
# Parse the content and extract citations
|
||||
paras = re.split("\n\n+", content_raw.strip())
|
||||
content = ""
|
||||
citations = {}
|
||||
citations = []
|
||||
format_id = 1
|
||||
if not paras:
|
||||
print("No content")
|
||||
|
@ -91,7 +128,8 @@ class LexiconArticle:
|
|||
cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
|
||||
cite_title = utils.titlecase(re.sub(r"\s+", " ", link_match.group(3)))
|
||||
# Record the citation
|
||||
citations["c"+str(format_id)] = (cite_text, cite_title)
|
||||
cite = LexiconCitation(format_id, cite_text, cite_title)
|
||||
citations.append(cite)
|
||||
# Stitch the format id in place of the citation
|
||||
para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):]
|
||||
format_id += 1 # Increment to the next format citation
|
||||
|
@ -129,83 +167,125 @@ class LexiconArticle:
|
|||
return articles
|
||||
|
||||
@staticmethod
def interlink(lexicon_articles):
    """
    Fills out fields on articles that require other articles for context.
    Creates phantom articles.

    The written articles are visited in (turn, title) order. The first
    article seen for a title becomes the main article for that title; any
    later article with the same title is appended to the main article's
    addendums. Every cited title with no written article gets a phantom
    article (player None, turn sys.maxsize). Each citation then has its
    .article set to the cited article, and the cited article's citedby set
    gains the citing article (main article or addendum).

    Returns the main and phantom articles sorted by turn and title, with
    prev_article/next_article filled in. Addendums are not in the returned
    list; they are reachable through their parent's addendums.
    """
    # Sort out which articles are addendums and which titles are cited
    article_by_title = {}
    cited_titles = set()
    written_articles_ordered = sorted(lexicon_articles, key=lambda a: (a.turn, a.title))
    for written_article in written_articles_ordered:
        parent = article_by_title.get(written_article.title)
        if parent is None:
            # Track main articles by title
            article_by_title[written_article.title] = written_article
        else:
            # Append addendums to their parents
            parent.addendums.append(written_article)
        # Collect all cited titles
        for citation in written_article.citations:
            cited_titles.add(citation.target)
    # Create articles for each phantom title. At this point the keys of
    # article_by_title are exactly the written titles.
    for title in cited_titles - set(article_by_title):
        # Citations are a list of LexiconCitations, so a phantom gets an
        # empty list rather than an empty dict.
        article_by_title[title] = LexiconArticle(
            None, sys.maxsize, title,
            "<p><i>This entry hasn't been written yet.</i></p>", [])
    # To interlink the articles, each citation needs to have its .article
    # filled in, and that article needs its citedby updated.
    for parent in article_by_title.values():
        for citing_article in [parent] + parent.addendums:
            for citation in citing_article.citations:
                target_article = article_by_title[citation.target]
                citation.article = target_article
                target_article.citedby.add(citing_article)
    # Sort the articles by turn and title, then fill in prev/next fields
    articles_ordered = sorted(
        article_by_title.values(),
        key=lambda a: (a.turn, utils.titlesort(a.title)))
    last_index = len(articles_ordered) - 1
    for i, article in enumerate(articles_ordered):
        article.prev_article = articles_ordered[i - 1] if i > 0 else None
        article.next_article = articles_ordered[i + 1] if i < last_index else None
    return articles_ordered
|
||||
|
||||
def build_default_content(self):
    """
    Builds the contents of the content div for an article page.

    The result is the main article body under an <h1> title, followed by the
    main citation block (omitted when empty), followed by one body block and
    one optional citation block per addendum. Addendum citation blocks are
    built without prev/next links.
    """
    blocks = []
    # Build the main article content block
    main_body = self.build_default_article_body()
    blocks.append("<div class=\"contentblock\"><h1>{}</h1>{}</div>\n".format(
        self.title, main_body))
    # Build the main citation content block
    main_citations = self.build_default_citeblock(
        self.prev_article, self.next_article)
    if main_citations:
        blocks.append("<div class=\"contentblock citeblock\">{}</div>\n".format(
            main_citations))
    # Build any addendum content blocks
    for addendum in self.addendums:
        add_body = addendum.build_default_article_body()
        blocks.append("<div class=\"contentblock\">{}</div>\n".format(add_body))
        add_citations = addendum.build_default_citeblock(None, None)
        if add_citations:
            # NOTE(review): unlike the main citation block, this wrapper has
            # no "citeblock" class -- confirm whether that is intentional.
            blocks.append("<div class=\"contentblock\">{}</div>\n".format(
                add_citations))
    return "".join(blocks)
|
||||
|
||||
def build_default_article_body(self):
    """
    Formats citations into the article text and returns the article body.

    Each citation with id N is rendered as an HTML link and substituted for
    the "{cN}" format hook in self.content. The link uses the cited
    article's link_class and title_filesafe, so every citation's .article
    must already be set before this is called.
    """
    link_template = ("<a {article.link_class} "
        "href=\"{article.title_filesafe}.html\">{text}</a>")
    format_map = {
        "c" + str(c.id): c.format(link_template)
        for c in self.citations
    }
    return self.content.format(**format_map)
|
||||
|
||||
def build_default_citeblock(self, prev_article, next_article):
    """
    Builds the contents of a citation contentblock. For each defined target,
    links the target page as Previous or Next. Prev/next and cites/citedby
    elements are not included if they have no content.

    Arguments:
        prev_article    article to link as Previous, or None
        next_article    article to link as Next, or None

    Returns the HTML as a string; empty if there is nothing to show.
    """
    content = ""
    # Prev/next links:
    if next_article is not None or prev_article is not None:
        prev_link = ""
        if prev_article is not None:
            prev_link = "<a {0.link_class} href=\"{0.title_filesafe}.html\">← Previous</a>".format(
                prev_article)
        next_link = ""
        if next_article is not None:
            next_link = "<a {0.link_class} href=\"{0.title_filesafe}.html\">Next →</a>".format(
                next_article)
        # The row is closed before the table so the markup nests correctly.
        content += "<table><tr>\n<td>{}</td>\n<td>{}</td>\n</tr></table>\n".format(
            prev_link, next_link)
    # Citations: one link per distinct target, ordered by title sort key
    cites_titles = set()
    cites_links = []
    for citation in sorted(self.citations, key=lambda c: (utils.titlesort(c.target), c.id)):
        if citation.target not in cites_titles:
            cites_titles.add(citation.target)
            cites_links.append(
                citation.format(
                    "<a {article.link_class} href=\"{article.title_filesafe}.html\">{article.title}</a>"))
    cites_str = " / ".join(cites_links)
    if cites_str:
        content += "<p>Citations: {}</p>\n".format(cites_str)
    # Citedby: one link per distinct citing title, so an article and its
    # addendums (which share a title) are listed once
    citedby_titles = set()
    citedby_links = []
    for article in sorted(self.citedby, key=lambda a: (utils.titlesort(a.title), a.turn)):
        if article.title not in citedby_titles:
            citedby_titles.add(article.title)
            citedby_links.append(
                "<a {0.link_class} href=\"{0.title_filesafe}.html\">{0.title}</a>".format(article))
    citedby_str = " / ".join(citedby_links)
    if citedby_str:
        content += "<p>Cited by: {}</p>\n".format(citedby_str)

    return content
|
||||
|
|
110
src/build.py
110
src/build.py
|
@ -133,20 +133,13 @@ def build_statistics_page(page, articles):
|
|||
Builds the full HTML of the statistics page.
|
||||
"""
|
||||
content = ""
|
||||
cite_map = {
|
||||
article.title : [
|
||||
cite_tuple[1]
|
||||
for cite_tuple
|
||||
in article.citations.values()
|
||||
]
|
||||
for article in articles}
|
||||
|
||||
# Top pages by pagerank
|
||||
# Compute pagerank for each article
|
||||
G = networkx.Graph()
|
||||
for citer, citeds in cite_map.items():
|
||||
for cited in citeds:
|
||||
G.add_edge(citer, cited)
|
||||
for article in articles:
|
||||
for citation in article.citations:
|
||||
G.add_edge(article.title, citation.target)
|
||||
rank_by_article = networkx.pagerank(G)
|
||||
# Get the top ten articles by pagerank
|
||||
top_pageranks = reverse_statistics_dict(rank_by_article)[:10]
|
||||
|
@ -156,34 +149,25 @@ def build_statistics_page(page, articles):
|
|||
top_ranked_items = itemize(top_ranked)
|
||||
# Write the statistics to the page
|
||||
content += "<div class=\"contentblock\">\n"
|
||||
content += "<u>Top 10 pages by page rank:</u><br>\n"
|
||||
content += "<u>Top 10 articles by page rank:</u><br>\n"
|
||||
content += "<br>\n".join(top_ranked_items)
|
||||
content += "</div>\n"
|
||||
|
||||
# Top number of citations made
|
||||
citations_made = { title : len(cites) for title, cites in cite_map.items() }
|
||||
citations_made = {article.title : len(article.citations) for article in articles}
|
||||
top_citations = reverse_statistics_dict(citations_made)[:3]
|
||||
top_citations_items = itemize(top_citations)
|
||||
content += "<div class=\"contentblock\">\n"
|
||||
content += "<u>Most citations made from:</u><br>\n"
|
||||
content += "<u>Top articles by citations made:</u><br>\n"
|
||||
content += "<br>\n".join(top_citations_items)
|
||||
content += "</div>\n"
|
||||
|
||||
# Top number of times cited
|
||||
# Build a map of what cites each article
|
||||
all_cited = set([title for citeds in cite_map.values() for title in citeds])
|
||||
cited_by_map = {
|
||||
cited: [
|
||||
citer
|
||||
for citer in cite_map.keys()
|
||||
if cited in cite_map[citer]]
|
||||
for cited in all_cited }
|
||||
# Compute the number of citations to each article
|
||||
citations_to = { title : len(cites) for title, cites in cited_by_map.items() }
|
||||
citations_to = {article.title : len(article.citedby) for article in articles}
|
||||
top_cited = reverse_statistics_dict(citations_to)[:3]
|
||||
top_cited_items = itemize(top_cited)
|
||||
content += "<div class=\"contentblock\">\n"
|
||||
content += "<u>Most citations made to:</u><br>\n"
|
||||
content += "<u>Most cited articles:</u><br>\n"
|
||||
content += "<br>\n".join(top_cited_items)
|
||||
content += "</div>\n"
|
||||
|
||||
|
@ -191,16 +175,15 @@ def build_statistics_page(page, articles):
|
|||
article_length = {}
|
||||
for article in articles:
|
||||
format_map = {
|
||||
format_id: cite_tuple[0]
|
||||
for format_id, cite_tuple in article.citations.items()
|
||||
"c"+str(c.id): c.text
|
||||
for c in article.citations
|
||||
}
|
||||
plain_content = article.content.format(**format_map)
|
||||
wordcount = len(plain_content.split())
|
||||
article_length[article.title] = wordcount
|
||||
article_length[article.title] = len(plain_content.split())
|
||||
top_length = reverse_statistics_dict(article_length)[:3]
|
||||
top_length_items = itemize(top_length)
|
||||
content += "<div class=\"contentblock\">\n"
|
||||
content += "<u>Longest article:</u><br>\n"
|
||||
content += "<u>Longest articles:</u><br>\n"
|
||||
content += "<br>\n".join(top_length_items)
|
||||
content += "</div>\n"
|
||||
|
||||
|
@ -211,21 +194,23 @@ def build_statistics_page(page, articles):
|
|||
content += "</div>\n"
|
||||
|
||||
# Player pageranks
|
||||
# Add addendums and recompute pagerank
|
||||
for article in articles:
|
||||
for addendum in article.addendums:
|
||||
for citation in addendum.citations:
|
||||
addendum_title = "{0.title}-T{0.turn}".format(addendum)
|
||||
G.add_edge(addendum_title, citation.target)
|
||||
rank_by_article = networkx.pagerank(G)
|
||||
players = sorted(set([article.player for article in articles if article.player is not None]))
|
||||
articles_by_player = {
|
||||
player : [
|
||||
a
|
||||
for a in articles
|
||||
if a.player == player]
|
||||
for player in players}
|
||||
pagerank_by_player = {
|
||||
player : round(
|
||||
sum(map(
|
||||
lambda a: rank_by_article[a.title] if a.title in rank_by_article else 0,
|
||||
articles)),
|
||||
3)
|
||||
for player, articles
|
||||
in articles_by_player.items()}
|
||||
pagerank_by_player = {player: 0 for player in players}
|
||||
for article in articles:
|
||||
if article.player is not None:
|
||||
pagerank_by_player[article.player] += rank_by_article[article.title]
|
||||
for addendum in article.addendums:
|
||||
addendum_title = "{0.title}-T{0.turn}".format(addendum)
|
||||
pagerank_by_player[addendum_title] += rank_by_article[addendum_title]
|
||||
for player in players:
|
||||
pagerank_by_player[player] = round(pagerank_by_player[player], 3)
|
||||
player_rank = reverse_statistics_dict(pagerank_by_player)
|
||||
player_rank_items = itemize(player_rank)
|
||||
content += "<div class=\"contentblock\">\n"
|
||||
|
@ -234,13 +219,17 @@ def build_statistics_page(page, articles):
|
|||
content += "</div>\n"
|
||||
|
||||
# Player citations made
|
||||
player_cite_count = {
|
||||
player : sum(map(lambda a:len(a.wcites | a.pcites), articles))
|
||||
for player, articles in articles_by_player.items()}
|
||||
player_cites_made_ranks = reverse_statistics_dict(player_cite_count)
|
||||
cite_count_by_player = {player: 0 for player in players}
|
||||
for article in articles:
|
||||
if article.player is not None:
|
||||
unique_citations = set([a.target for a in article.citations])
|
||||
cite_count_by_player[article.player] += len(unique_citations)
|
||||
for addendum in article.addendums:
|
||||
cite_count_by_player[addendum.player] += len(addendum.citations)
|
||||
player_cites_made_ranks = reverse_statistics_dict(cite_count_by_player)
|
||||
player_cites_made_items = itemize(player_cites_made_ranks)
|
||||
content += "<div class=\"contentblock\">\n"
|
||||
content += "<u>Citations made by player</u><br>\n"
|
||||
content += "<u>Citations made by player:</u><br>\n"
|
||||
content += "<br>\n".join(player_cites_made_items)
|
||||
content += "</div>\n"
|
||||
|
||||
|
@ -252,7 +241,7 @@ def build_statistics_page(page, articles):
|
|||
cited_times_ranked = reverse_statistics_dict(cited_times)
|
||||
cited_times_items = itemize(cited_times_ranked)
|
||||
content += "<div class=\"contentblock\">\n"
|
||||
content += "<u>Citations made to player</u><br>\n"
|
||||
content += "<u>Citations made to player:</u><br>\n"
|
||||
content += "<br>\n".join(cited_times_items)
|
||||
content += "</div>\n"
|
||||
|
||||
|
@ -350,7 +339,7 @@ def build_all(path_prefix, lexicon_name):
|
|||
# Once they've been populated, the articles list has the titles of all articles
|
||||
# Sort this by turn before title so prev/next links run in turn order
|
||||
articles = sorted(
|
||||
LexiconArticle.populate(articles),
|
||||
LexiconArticle.interlink(articles),
|
||||
key=lambda a: (a.turn, utils.titlesort(a.title)))
|
||||
|
||||
def pathto(*els):
|
||||
|
@ -372,13 +361,14 @@ def build_all(path_prefix, lexicon_name):
|
|||
for idx in range(l):
|
||||
article = articles[idx]
|
||||
with open(pathto("article", article.title_filesafe + ".html"), "w", encoding="utf-8") as f:
|
||||
contentblock = article.build_default_contentblock()
|
||||
citeblock = article.build_default_citeblock(
|
||||
None if idx == 0 else articles[idx - 1],
|
||||
None if idx == l-1 else articles[idx + 1])
|
||||
content = article.build_default_content()
|
||||
#contentblock = article.build_default_contentblock()
|
||||
#citeblock = article.build_default_citeblock(
|
||||
# None if idx == 0 else articles[idx - 1],
|
||||
# None if idx == l-1 else articles[idx + 1])
|
||||
article_html = page.format(
|
||||
title = article.title,
|
||||
content = contentblock + citeblock)
|
||||
content = content)
|
||||
f.write(article_html)
|
||||
print(" Wrote " + article.title)
|
||||
|
||||
|
@ -409,10 +399,10 @@ def build_all(path_prefix, lexicon_name):
|
|||
# Check that authors aren't citing themselves
|
||||
print("Running citation checks...")
|
||||
article_by_title = {article.title : article for article in articles}
|
||||
for article in articles:
|
||||
for _, tup in article.citations.items():
|
||||
cited = article_by_title[tup[1]]
|
||||
if article.player == cited.player:
|
||||
print(" {2}: {0} cites {1}".format(article.title, cited.title, cited.player))
|
||||
#for article in articles:
|
||||
# for _, tup in article.citations.items():
|
||||
# cited = article_by_title[tup[1]]
|
||||
# if article.player == cited.player:
|
||||
# print(" {2}: {0} cites {1}".format(article.title, cited.title, cited.player))
|
||||
|
||||
print()
|
||||
|
|
Loading…
Reference in New Issue