From 7490cd6f7fa289f44eb5245d6c7d554cc62fd28b Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 31 Oct 2018 15:22:15 -0700 Subject: [PATCH] Refactor citations and add addendum articles --- src/article.py | 230 +++++++++++++++++++++++++++++++++---------------- src/build.py | 110 +++++++++++------------ 2 files changed, 205 insertions(+), 135 deletions(-) diff --git a/src/article.py b/src/article.py index 9a0b09f..6df114d 100644 --- a/src/article.py +++ b/src/article.py @@ -3,21 +3,50 @@ import sys import re import src.utils as utils +class LexiconCitation: + """ + Represents information about a single citation in a Lexicon article. + + Members: + id int: citation id within the article, corresponding to a "{cN}" + format hook + text string: alias text linked to the citation target + target string: title of the article being cited + article LexiconArticle: article cited, None until interlink + """ + def __init__(self, id, citation_text, citation_target, article=None): + self.id = id + self.text = citation_text + self.target = citation_target + self.article = article + + def __repr__(self): + return "".format(self) + + def __str__(self): + return "<[{0.id}]:[[{0.text}|{0.target}]]>".format(self) + + def format(self, format_str): + return format_str.format(**self.__dict__) + class LexiconArticle: """ A Lexicon article and its metadata. - Members: - player string: the player of the article - turn integer: the turn the article was written for - title string: the article title - title_filesafe string: the title, escaped, used for filenames - content string: the HTML content, with citations replaced by format hooks - citations dict mapping format hook string to tuple of link alias and link target title - wcites list: titles of written articles cited - pcites list: titles of phantom articles cited - citedby list: titles of articles that cite this - The last three are filled in by populate(). + Members defined by __init__: + player string: player who wrote the article + turn integer: turn the article was written for + title string: article title + title_filesafe string: title, escaped, used for filenames + content string: HTML content, with citations replaced by format hooks + citations list of LexiconCitations: citations made by the article + link_class string: CSS class to interpolate (for styling phantoms) + + Members undefined until interlink: + addendums list of LexiconArticles: addendum articles to this article + citedby set of LexiconArticles: articles that cite this article + prev_article LexiconArticle: the previous article in read order + next_article LexiconArticle: the next article in read order """ def __init__(self, player, turn, title, content, citations): @@ -30,9 +59,17 @@ class LexiconArticle: self.title_filesafe = utils.titleescape(title) self.content = content self.citations = citations - self.wcites = set() - self.pcites = set() + self.link_class = "class=\"phantom\"" if player is None else "" + self.addendums = [] self.citedby = set() + self.prev_article = None + self.next_article = None + + def __repr__(self): + return "".format(self) + + def __str__(self): + return "<\"{0.title}\", {0.player} turn {0.turn}>".format(self) @staticmethod def from_file_raw(raw_content): @@ -68,7 +105,7 @@ class LexiconArticle: # Parse the content and extract citations paras = re.split("\n\n+", content_raw.strip()) content = "" - citations = {} + citations = [] format_id = 1 if not paras: print("No content") @@ -91,7 +128,8 @@ class LexiconArticle: cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3) cite_title = utils.titlecase(re.sub(r"\s+", " ", link_match.group(3))) # Record the citation - citations["c"+str(format_id)] = (cite_text, cite_title) + cite = LexiconCitation(format_id, cite_text, cite_title) + citations.append(cite) # Stitch the format id in place of the citation para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):] format_id += 1 # Increment to the next format citation @@ -129,83 +167,125 @@ class LexiconArticle: return articles @staticmethod - def populate(lexicon_articles): + def interlink(lexicon_articles): """ - Given a list of lexicon articles, fills out citation information - for each article and creates phantom pages for missing articles. + Fills out fields on articles that require other articles for context. + Creates phantom articles. """ - article_by_title = {article.title : article for article in lexicon_articles} - # Determine all articles that exist or should exist - extant_titles = set([citation[1] for article in lexicon_articles for citation in article.citations]) - # Interlink all citations - for article in lexicon_articles: - for cite_tuple in article.citations.values(): - target = cite_tuple[1] - # Create article objects for phantom citations - if target not in article_by_title: - article_by_title[target] = LexiconArticle(None, sys.maxsize, target, - "

This entry hasn't been written yet.

", {}) - # Interlink citations - if article_by_title[target].player is None: - article.pcites.add(target) - else: - article.wcites.add(target) - article_by_title[target].citedby.add(article.title) - return list(article_by_title.values()) + # Sort out which articles are addendums and which titles are phantoms + written_titles = set() + cited_titles = set() + article_by_title = {} + written_articles_ordered = sorted(lexicon_articles, key=lambda a: (a.turn, a.title)) + for written_article in written_articles_ordered: + # Track main articles by title + if written_article.title not in written_titles: + article_by_title[written_article.title] = written_article + written_titles.add(written_article.title) + # Append addendums to their parents + else: + parent = article_by_title[written_article.title] + parent.addendums.append(written_article) + # Collect all cited titles + for citation in written_article.citations: + cited_titles.add(citation.target) + # Create articles for each phantom title + for title in cited_titles - written_titles: + phantom_article = LexiconArticle( + None, sys.maxsize, title, + "

This entry hasn't been written yet.

", {}) + article_by_title[title] = phantom_article + # To interlink the articles, each citation needs to have its .article + # filled in, and that article needs its citedby updated. + for parent in article_by_title.values(): + under_title = [parent] + parent.addendums + for citing_article in under_title: + for citation in citing_article.citations: + target_article = article_by_title[citation.target] + citation.article = target_article + target_article.citedby.add(citing_article) + # Sort the articles by turn and title, then fill in prev/next fields + articles_ordered = sorted(article_by_title.values(), key=lambda a: (a.turn, utils.titlesort(a.title))) + for i in range(len(articles_ordered)): + articles_ordered[i].prev_article = articles_ordered[i-1] if i != 0 else None + articles_ordered[i].next_article = articles_ordered[i+1] if i != len(articles_ordered)-1 else None + return articles_ordered - def build_default_contentblock(self): + def build_default_content(self): """ - Formats citations into the article content as normal HTML links - and returns the result. + Builds the contents of the content div for an article page. + """ + content = "" + # Build the main article content block + main_body = self.build_default_article_body() + content += "

{}

{}
\n".format( + self.title, main_body) + # Build the main citation content block + main_citations = self.build_default_citeblock( + self.prev_article, self.next_article) + if main_citations: + content += "
{}
\n".format( + main_citations) + # Build any addendum content blocks + for addendum in self.addendums: + add_body = addendum.build_default_article_body() + content += "
{}
\n".format(add_body) + add_citations = addendum.build_default_citeblock(None, None) + if add_citations: + content += "
{}
\n".format( + add_citations) + return content + + def build_default_article_body(self): + """ + Formats citations into the article text and returns the article body. """ format_map = { - format_id: "{0}".format( - cite_tuple[0], utils.titleescape(cite_tuple[1]), - "" if cite_tuple[1] in self.wcites else " class=\"phantom\"") - for format_id, cite_tuple in self.citations.items() + "c"+str(c.id) : c.format("{text}") + for c in self.citations } - article_content = self.content.format(**format_map) - return "
\n

{}

\n{}
\n".format( - self.title, - article_content) + return self.content.format(**format_map) def build_default_citeblock(self, prev_article, next_article): """ - Builds the citeblock content HTML for use in regular article pages. - For each defined target, links the target page as Previous or Next. + Builds the contents of a citation contentblock. For each defined target, + links the target page as Previous or Next. Prev/next and cites/citedby + elements are not included if they have no content. """ - citeblock = "
\n" + content = "" # Prev/next links: if next_article is not None or prev_article is not None: - prev_link = ("← Previous".format( - prev_article.title_filesafe, - " class=\"phantom\"" if prev_article.player is None else "") + prev_link = ("← Previous".format( + prev_article) if prev_article is not None else "") - next_link = ("Next →".format( - next_article.title_filesafe, - " class=\"phantom\"" if next_article.player is None else "") + next_link = ("Next →".format( + next_article) if next_article is not None else "") - citeblock += "\n\n\n
{}{}
\n".format( + content += "\n\n\n
{}{}
\n".format( prev_link, next_link) # Citations - cites_links = [ - "{0}".format( - title, utils.titleescape(title), - "" if title in self.wcites else " class=\"phantom\"") - for title in sorted( - self.wcites | self.pcites, - key=lambda t: utils.titlesort(t))] + cites_titles = set() + cites_links = [] + for citation in sorted(self.citations, key=lambda c: (utils.titlesort(c.target), c.id)): + if citation.target not in cites_titles: + cites_titles.add(citation.target) + cites_links.append( + citation.format( + "{article.title}")) cites_str = " / ".join(cites_links) - if len(cites_str) < 1: cites_str = "—" - citeblock += "

Citations: {}

\n".format(cites_str) + if len(cites_str) > 0: + content += "

Citations: {}

\n".format(cites_str) # Citedby - citedby_links = [ - "{0}".format( - title, utils.titleescape(title)) - for title in sorted( - self.citedby, - key=lambda t: utils.titlesort(t))] + citedby_titles = set() + citedby_links = [] + for article in sorted(self.citedby, key=lambda a: (utils.titlesort(a.title), a.turn)): + if article.title not in citedby_titles: + citedby_titles.add(article.title) + citedby_links.append( + "{0.title}".format(article)) citedby_str = " / ".join(citedby_links) - if len(citedby_str) < 1: citedby_str = "—" - citeblock += "

Cited by: {}

\n
\n".format(citedby_str) - return citeblock + if len(citedby_str) > 0: + content += "

Cited by: {}

\n".format(citedby_str) + + return content diff --git a/src/build.py b/src/build.py index bdab5ca..40413e5 100644 --- a/src/build.py +++ b/src/build.py @@ -133,20 +133,13 @@ def build_statistics_page(page, articles): Builds the full HTML of the statistics page. """ content = "" - cite_map = { - article.title : [ - cite_tuple[1] - for cite_tuple - in article.citations.values() - ] - for article in articles} # Top pages by pagerank # Compute pagerank for each article G = networkx.Graph() - for citer, citeds in cite_map.items(): - for cited in citeds: - G.add_edge(citer, cited) + for article in articles: + for citation in article.citations: + G.add_edge(article.title, citation.target) rank_by_article = networkx.pagerank(G) # Get the top ten articles by pagerank top_pageranks = reverse_statistics_dict(rank_by_article)[:10] @@ -156,34 +149,25 @@ def build_statistics_page(page, articles): top_ranked_items = itemize(top_ranked) # Write the statistics to the page content += "
\n" - content += "Top 10 pages by page rank:
\n" + content += "Top 10 articles by page rank:
\n" content += "
\n".join(top_ranked_items) content += "
\n" # Top number of citations made - citations_made = { title : len(cites) for title, cites in cite_map.items() } + citations_made = {article.title : len(article.citations) for article in articles} top_citations = reverse_statistics_dict(citations_made)[:3] top_citations_items = itemize(top_citations) content += "
\n" - content += "Most citations made from:
\n" + content += "Top articles by citations made:
\n" content += "
\n".join(top_citations_items) content += "
\n" # Top number of times cited - # Build a map of what cites each article - all_cited = set([title for citeds in cite_map.values() for title in citeds]) - cited_by_map = { - cited: [ - citer - for citer in cite_map.keys() - if cited in cite_map[citer]] - for cited in all_cited } - # Compute the number of citations to each article - citations_to = { title : len(cites) for title, cites in cited_by_map.items() } + citations_to = {article.title : len(article.citedby) for article in articles} top_cited = reverse_statistics_dict(citations_to)[:3] top_cited_items = itemize(top_cited) content += "
\n" - content += "Most citations made to:
\n" + content += "Most cited articles:
\n" content += "
\n".join(top_cited_items) content += "
\n" @@ -191,16 +175,15 @@ def build_statistics_page(page, articles): article_length = {} for article in articles: format_map = { - format_id: cite_tuple[0] - for format_id, cite_tuple in article.citations.items() + "c"+str(c.id): c.text + for c in article.citations } plain_content = article.content.format(**format_map) - wordcount = len(plain_content.split()) - article_length[article.title] = wordcount + article_length[article.title] = len(plain_content.split()) top_length = reverse_statistics_dict(article_length)[:3] top_length_items = itemize(top_length) content += "
\n" - content += "Longest article:
\n" + content += "Longest articles:
\n" content += "
\n".join(top_length_items) content += "
\n" @@ -211,21 +194,23 @@ def build_statistics_page(page, articles): content += "\n" # Player pageranks + # Add addendums and recompute pagerank + for article in articles: + for addendum in article.addendums: + for citation in addendum.citations: + addendum_title = "{0.title}-T{0.turn}".format(addendum) + G.add_edge(addendum_title, citation.target) + rank_by_article = networkx.pagerank(G) players = sorted(set([article.player for article in articles if article.player is not None])) - articles_by_player = { - player : [ - a - for a in articles - if a.player == player] - for player in players} - pagerank_by_player = { - player : round( - sum(map( - lambda a: rank_by_article[a.title] if a.title in rank_by_article else 0, - articles)), - 3) - for player, articles - in articles_by_player.items()} + pagerank_by_player = {player: 0 for player in players} + for article in articles: + if article.player is not None: + pagerank_by_player[article.player] += rank_by_article[article.title] + for addendum in article.addendums: + addendum_title = "{0.title}-T{0.turn}".format(addendum) + pagerank_by_player[addendum_title] += rank_by_article[addendum_title] + for player in players: + pagerank_by_player[player] = round(pagerank_by_player[player], 3) player_rank = reverse_statistics_dict(pagerank_by_player) player_rank_items = itemize(player_rank) content += "
\n" @@ -234,13 +219,17 @@ def build_statistics_page(page, articles): content += "
\n" # Player citations made - player_cite_count = { - player : sum(map(lambda a:len(a.wcites | a.pcites), articles)) - for player, articles in articles_by_player.items()} - player_cites_made_ranks = reverse_statistics_dict(player_cite_count) + cite_count_by_player = {player: 0 for player in players} + for article in articles: + if article.player is not None: + unique_citations = set([a.target for a in article.citations]) + cite_count_by_player[article.player] += len(unique_citations) + for addendum in article.addendums: + cite_count_by_player[addendum.player] += len(addendum.citations) + player_cites_made_ranks = reverse_statistics_dict(cite_count_by_player) player_cites_made_items = itemize(player_cites_made_ranks) content += "
\n" - content += "Citations made by player
\n" + content += "Citations made by player:
\n" content += "
\n".join(player_cites_made_items) content += "
\n" @@ -252,7 +241,7 @@ def build_statistics_page(page, articles): cited_times_ranked = reverse_statistics_dict(cited_times) cited_times_items = itemize(cited_times_ranked) content += "
\n" - content += "Citations made to player
\n" + content += "Citations made to player:
\n" content += "
\n".join(cited_times_items) content += "
\n" @@ -350,7 +339,7 @@ def build_all(path_prefix, lexicon_name): # Once they've been populated, the articles list has the titles of all articles # Sort this by turn before title so prev/next links run in turn order articles = sorted( - LexiconArticle.populate(articles), + LexiconArticle.interlink(articles), key=lambda a: (a.turn, utils.titlesort(a.title))) def pathto(*els): @@ -372,13 +361,14 @@ def build_all(path_prefix, lexicon_name): for idx in range(l): article = articles[idx] with open(pathto("article", article.title_filesafe + ".html"), "w", encoding="utf-8") as f: - contentblock = article.build_default_contentblock() - citeblock = article.build_default_citeblock( - None if idx == 0 else articles[idx - 1], - None if idx == l-1 else articles[idx + 1]) + content = article.build_default_content() + #contentblock = article.build_default_contentblock() + #citeblock = article.build_default_citeblock( + # None if idx == 0 else articles[idx - 1], + # None if idx == l-1 else articles[idx + 1]) article_html = page.format( title = article.title, - content = contentblock + citeblock) + content = content) f.write(article_html) print(" Wrote " + article.title) @@ -409,10 +399,10 @@ def build_all(path_prefix, lexicon_name): # Check that authors aren't citing themselves print("Running citation checks...") article_by_title = {article.title : article for article in articles} - for article in articles: - for _, tup in article.citations.items(): - cited = article_by_title[tup[1]] - if article.player == cited.player: - print(" {2}: {0} cites {1}".format(article.title, cited.title, cited.player)) + #for article in articles: + # for _, tup in article.citations.items(): + # cited = article_by_title[tup[1]] + # if article.player == cited.player: + # print(" {2}: {0} cites {1}".format(article.title, cited.title, cited.player)) print()