###############################
## Lexipython Lexicon engine ##
###############################
import sys # For argv and stderr
import os # For reading directories
import re # For parsing lex content
import io # For writing pages out as UTF-8
import networkx # For pagerank analytics
from collections import defaultdict # For rank inversion in statistics
from urllib import parse
# Short utility functions for handling titles
def titlecase(s):
    """Trims the title and upper-cases its first character, leaving the rest as written."""
    trimmed = s.strip()
    head, tail = trimmed[:1], trimmed[1:]
    return head.capitalize() + tail
def titleescape(s):
    """
    Makes an article title filename-safe.

    Surrounding whitespace is trimmed, each run of inner whitespace becomes a
    single underscore, every other unsafe character (including "/") is
    percent-encoded, and the percent signs are then dropped. If the result is
    longer than 64 characters it is replaced by a 16-character hex digest.
    The same title always maps to the same result, across runs as well.
    """
    import hashlib # Local import; only needed for over-long titles
    s = s.strip()
    s = re.sub(r"\s+", '_', s) # Replace whitespace with _
    # safe="" also encodes "/", which quote() keeps by default and which
    # would otherwise end up as a path separator in the filename
    s = parse.quote(s, safe="") # Encode all other characters
    s = s.replace("%", "") # Strip encoding %s
    if len(s) > 64: # If the result is unreasonably long,
        # Replace it with a hex digest. The builtin hash() is salted per
        # process for strings, so it would name the file differently on
        # every run; a content digest is stable.
        s = hashlib.sha256(s.encode("utf-8")).hexdigest()[:16]
    return s
def titlestrip(s):
    """Drops one leading "The ", "An " or "A " so titles sort by their main word."""
    # Checked in this order; only the first matching prefix is removed
    for article in ("The ", "An ", "A "):
        if s.startswith(article):
            return s[len(article):]
    return s
# Main article class
class LexiconArticle:
    """
    A Lexicon article and its metadata.

    Members:
    author          string: the author of the article (code elsewhere in
                    this file treats an article whose author is None as a
                    phantom, i.e. cited but not yet written)
    turn            integer: the turn the article was written for
    title           string: the article title
    title_filesafe  string: the title, escaped, used for filenames
    content         string: the HTML content, with citations replaced by
                    format hooks
    citations       dict from format hook string to tuple of link alias and
                    link target title
    wcites          set: titles of written articles cited
    pcites          set: titles of phantom articles cited
    citedby         set: titles of articles that cite this

    The last three start out as empty sets and are filled in by populate().
    """
    def __init__(self, author, turn, title, content, citations):
        """
        Creates a LexiconArticle object with the given parameters.

        The filename-safe title is derived from title via titleescape(), and
        the three cross-reference sets (wcites, pcites, citedby) are created
        empty.
        """
        self.author = author
        self.turn = turn
        self.title = title
        # Escaped once here so every user of the article sees the same name
        self.title_filesafe = titleescape(title)
        self.content = content
        self.citations = citations
        # Cross-reference sets; empty until the articles are interlinked
        self.wcites = set()
        self.pcites = set()
        self.citedby = set()
@staticmethod
def from_file_raw(raw_content):
"""
Parses the contents of a Lexipython source file into a LexiconArticle
object. If the source file is malformed, returns None.
"""
headers = raw_content.split('\n', 3)
if len(headers) != 4:
print("Header read error")
return None
author_header, turn_header, title_header, content_raw = headers
# Validate and sanitize the author header
if not author_header.startswith("# Author:"):
print("Author header missing")
return None
author = author_header[9:].strip()
# Validate and sanitize the turn header
if not turn_header.startswith("# Turn:"):
print("Turn header missing")
return None
turn = None
try:
turn = int(turn_header[7:].strip())
except:
print("Turn header error")
return None
# Validate and sanitize the title header
if not title_header.startswith("# Title:"):
print("Title header missing")
return None
title = titlecase(title_header[8:])
# Parse the content and extract citations
paras = re.split("\n\n+", content_raw.strip())
content = ""
citations = {}
format_id = 1
if not paras:
print("No content")
for para in paras:
# Escape angle brackets
para = re.sub("<", "<", para)
para = re.sub(">", ">", para)
# Replace bold and italic marks with tags
para = re.sub(r"//([^/]+)//", r"\1", para)
para = re.sub(r"\*\*([^*]+)\*\*", r"\1", para)
# Replace \\LF with
LF
para = re.sub(r"\\\\\n", "
\n", para)
# Abstract citations into the citation record
link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
while link_match:
# Identify the citation text and cited article
cite_text = link_match.group(2) if link_match.group(2) else link_match.group(3)
cite_title = titlecase(link_match.group(3))
# Record the citation
citations["c"+str(format_id)] = (cite_text, cite_title)
# Stitch the format id in place of the citation
para = para[:link_match.start(0)] + "{c"+str(format_id)+"}" + para[link_match.end(0):]
format_id += 1 # Increment to the next format citation
link_match = re.search(r"\[\[(([^|\[\]]+)\|)?([^|\[\]]+)\]\]", para)
# Convert signature to right-aligned
if para[:1] == '~':
para = "
" + para[1:] + "
\n" else: para = "" + para + "
\n" content += para return LexiconArticle(author, turn, title, content, citations) def build_page_content(self): """ Formats citations into the article content as normal HTML links and returns the result. """ format_map = { format_id: "{0}".format( cite_tuple[0], titleescape(cite_tuple[1]), "" if cite_tuple[1] in self.wcites else " class=\"phantom\"") for format_id, cite_tuple in self.citations.items() } return self.content.format(**format_map) def build_page_citeblock(self, prev_target, next_target): """ Builds the citeblock content HTML for use in regular article pages. For each defined target, links the target page as Previous or Next. """ citeblock = "\n" # Citations cites_links = [ "{0}".format( title, titleescape(title), "" if title in self.wcites else " class=\"phantom\"") for title in sorted(self.wcites | self.pcites)] cites_str = " | ".join(cites_links) if len(cites_str) < 1: cites_str = "--" citeblock += "
Citations: {}
\n".format(cites_str) # Citedby citedby_links = [ "{0}".format( title, titleescape(title)) for title in self.citedby] citedby_str = " | ".join(citedby_links) if len(citedby_str) < 1: citedby_str = "--" citeblock += "Cited by: {}
\nThis entry hasn't been written yet.
", {}) # Interlink citations if article_by_title[target].author is None: article.pcites.add(target) else: article.wcites.add(target) article_by_title[target].citedby.add(article.title) return list(article_by_title.values()) def load_resource(filename, cache={}): """Loads files from the resources directory with caching.""" if filename not in cache: cache[filename] = open("resources/" + filename, "r", encoding="utf8").read() return cache[filename] def load_config(): """Loads values from the config file.""" config = {} with open("lexicon.cfg", "r", encoding="utf8") as f: line = f.readline() while line: # Skim lines until a value definition begins conf_match = re.match(">>>([^>]+)>>>\s+", line) if not conf_match: line = f.readline() continue # Accumulate the conf value until the value ends conf = conf_match.group(1) conf_value = "" line = f.readline() conf_match = re.match("<<<{0}<<<\s+".format(conf), line) while line and not conf_match: conf_value += line line = f.readline() conf_match = re.match("<<<{0}<<<\s+".format(conf), line) if not line: raise SystemExit("Reached EOF while reading config value {}".format(conf)) config[conf] = conf_value.strip() # Check that all necessary values were configured for config_value in ['LEXICON_TITLE', 'PROMPT', 'SESSION_PAGE', "INDEX_LIST"]: if config_value not in config: raise SystemExit("Error: {} not set in lexipython.cfg".format(config_value)) return config # Build functions def build_contents_page(articles, config): """ Builds the full HTML of the contents page. """ content = "" # Article counts phantom_count = len([article for article in articles if article.author is None]) if phantom_count == 0: content = "There are {0} entries in this lexicon.
\n".format(len(articles)) else: content = "There are {0} entries, {1} written and {2} phantom.
\n".format( len(articles), len(articles) - phantom_count, phantom_count) # Prepare article links link_by_title = {article.title : "{0}".format( article.title, article.title_filesafe, "" if article.author is not None else " class=\"phantom\"") for article in articles} # Write the articles in alphabetical order content += load_resource("contents.html") content += "Top 10 pages by page rank:
\n"
G = networkx.Graph()
for citer, citeds in cite_map.items():
for cited in citeds:
G.add_edge(citer, cited)
ranks = networkx.pagerank(G)
sranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
ranking = list(enumerate(map(lambda x: x[0], sranks)))
content += "
\n".join(map(lambda x: "{0} – {1}".format(x[0]+1, x[1]), ranking[:10]))
content += "
Most citations made from:
\n"
citation_tally = [(kv[0], len(kv[1])) for kv in cite_map.items()]
citation_count = defaultdict(list)
for title, count in citation_tally: citation_count[count].append(title)
content += "
\n".join(map(
lambda kv: "{0} – {1}".format(kv[0], "; ".join(kv[1])),
sorted(citation_count.items(), reverse=True)[:3]))
content += "
Most citations made to:
\n"
all_cited = set([title for cites in cite_map.values() for title in cites])
cited_by_map = { cited: [citer for citer in cite_map.keys() if cited in cite_map[citer]] for cited in all_cited }
cited_tally = [(kv[0], len(kv[1])) for kv in cited_by_map.items()]
cited_count = defaultdict(list)
for title, count in cited_tally: cited_count[count].append(title)
content += "
\n".join(map(
lambda kv: "{0} – {1}".format(kv[0], "; ".join(kv[1])),
sorted(cited_count.items(), reverse=True)[:3]))
content += "
Author total page rank:
\n"
authors = sorted(set([article.author for article in articles if article.author is not None]))
articles_by = {author : [a for a in articles if a.author == author] for author in authors}
author_rank = {author : sum(map(lambda a: ranks[a.title], articles)) for author, articles in articles_by.items()}
content += "
\n".join(map(
lambda kv: "{0} – {1}".format(kv[0], round(kv[1], 3)),
sorted(author_rank.items(), key=lambda t:-t[1])))
content += "
Citations made by author
\n"
author_cite_count = {author : sum(map(lambda a:len(a.wcites | a.pcites), articles)) for author, articles in articles_by.items()}
content += "
\n".join(map(
lambda kv: "{0} – {1}".format(kv[0], kv[1]),
sorted(author_cite_count.items(), key=lambda t:-t[1])))
content += "
Citations made to author
\n"
cited_times = {author : 0 for author in authors}
for article in articles:
if article.author is not None:
cited_times[article.author] += len(article.citedby)
content += "
\n".join(map(
lambda kv: "{0} – {1}".format(kv[0], kv[1]),
sorted(cited_times.items(), key=lambda t:-t[1])))
content += "