Refactor and annotate parser submodule

2020-04-21 12:52:02 -07:00 · 2020-04-21 12:52:02 -07:00 · 9e4144eccf
commit 9e4144eccf
parent d0f57c85ce
9 changed files with 243 additions and 127 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ __pycache__/
 *.egg-info
 venv/
 .vscode
+.mypy_cache
--- a/amanuensis/parser/__init__.py
+++ b/amanuensis/parser/__init__.py
@@ -1,8 +1,18 @@
 """
-Module encapsulating all markdown parsing functionality
+Module encapsulating all markdown parsing functionality.
 """

 from amanuensis.parser.analyze import FeatureCounter, GetCitations
 from amanuensis.parser.helpers import titlesort, filesafe_title
-from amanuensis.parser.tokenizer import parse_raw_markdown
-from amanuensis.parser.render import PreviewHtmlRenderer, HtmlRenderer
+from amanuensis.parser.parsing import parse_raw_markdown
+from amanuensis.parser.render import PreviewHtmlRenderer, HtmlRenderer
+
+__all__ = [
+	'FeatureCounter',
+	'GetCitations',
+	'titlesort',
+	'filesafe_title',
+	'parse_raw_markdown',
+	'PreviewHtmlRenderer',
+	'HtmlRenderer',
+]
--- a/amanuensis/parser/analyze.py
+++ b/amanuensis/parser/analyze.py
@@ -5,41 +5,22 @@ for verification against constraints.

 import re

-class RenderableVisitor():
-	"""Default implementation of the visitor pattern"""
-	def TextSpan(self, span):
-		return self
-	def LineBreak(self, span):
-		return self
-	def ParsedArticle(self, span):
-		span.recurse(self)
-		return self
-	def BodyParagraph(self, span):
-		span.recurse(self)
-		return self
-	def SignatureParagraph(self, span):
-		span.recurse(self)
-		return self
-	def BoldSpan(self, span):
-		span.recurse(self)
-		return self
-	def ItalicSpan(self, span):
-		span.recurse(self)
-		return self
-	def CitationSpan(self, span):
-		span.recurse(self)
-		return self
+from amanuensis.parser.core import RenderableVisitor
+

 class GetCitations(RenderableVisitor):
 	def __init__(self):
 		self.citations = []
+
 	def ParsedArticle(self, span):
 		span.recurse(self)
 		return self.citations
+
 	def CitationSpan(self, span):
 		self.citations.append(span.cite_target)
 		return self

+
 class FeatureCounter(RenderableVisitor):
 	def __init__(self):
 		self.word_count = 0
@@ -47,7 +28,7 @@ class FeatureCounter(RenderableVisitor):
 		self.has_signature = False

 	def TextSpan(self, span):
-		self.word_count += len(re.split('\s+', span.innertext.strip()))
+		self.word_count += len(re.split(r'\s+', span.innertext.strip()))
 		return self

 	def SignatureParagraph(self, span):
--- a/amanuensis/parser/core.py
+++ b/amanuensis/parser/core.py
@@ -0,0 +1,135 @@
+"""
+Internal module encapsulating the core types for parsing Lexipython
+markdown. Parsed articles are represented as a hierarchy of tokens,
+which can be operated on by a visitor defining functions that hook off
+of the different token types.
+"""
+
+import re
+from typing import Callable, Any, Sequence
+
+RenderHook = Callable[['Renderable'], Any]
+Spans = Sequence['Renderable']
+
+
+def normalize_title(title: str) -> str:
+	"""
+	Normalizes strings as titles:
+	- Strips leading and trailing whitespace
+	- Merges internal whitespace into a single space
+	- Capitalizes the first word
+	"""
+	cleaned = re.sub(r'\s+', " ", title.strip())
+	return cleaned[:1].capitalize() + cleaned[1:]
+
+
+class Renderable():
+	"""
+	Base class for parsed markdown. Provides the `render()` method for
+	visiting the token tree.
+	"""
+	def render(self: 'Renderable', renderer: 'RenderableVisitor'):
+		"""
+		Execute the appropriate visitor method on this Renderable.
+		"""
+		hook: RenderHook = getattr(renderer, type(self).__name__, None)
+		if hook:
+			return hook(self)
+		return None
+
+
+class TextSpan(Renderable):
+	"""An unstyled length of text."""
+	def __init__(self, innertext: str):
+		self.innertext = innertext
+
+	def __str__(self):
+		return f"[{self.innertext}]"
+
+
+class LineBreak(Renderable):
+	"""A line break within a paragraph."""
+	def __str__(self):
+		return "<break>"
+
+
+class SpanContainer(Renderable):
+	"""A formatting element that wraps some amount of text."""
+	def __init__(self, spans: Spans):
+		self.spans: Spans = spans
+
+	def __str__(self):
+		return (f'[{type(self).__name__} '
+			+ f'{" ".join([str(span) for span in self.spans])}]')
+
+	def recurse(self, renderer: 'RenderableVisitor'):
+		return [child.render(renderer) for child in self.spans]
+
+
+class ParsedArticle(SpanContainer):
+	"""Token tree root node, containing some number of paragraph tokens."""
+
+
+class BodyParagraph(SpanContainer):
+	"""A normal paragraph."""
+
+
+class SignatureParagraph(SpanContainer):
+	"""A paragraph preceded by a signature mark."""
+
+
+class BoldSpan(SpanContainer):
+	"""A span of text inside bold marks."""
+
+
+class ItalicSpan(SpanContainer):
+	"""A span of text inside italic marks."""
+
+
+class CitationSpan(SpanContainer):
+	"""A citation to another article."""
+	def __init__(self, spans: Spans, cite_target: str):
+		super().__init__(spans)
+		# Normalize citation target on parse, since we don't want
+		# abnormal title strings lying around causing trouble.
+		self.cite_target: str = normalize_title(cite_target)
+
+	def __str__(self):
+		return (f'{{{" ".join([str(span) for span in self.spans])}'
+			+ f':{self.cite_target}}}')
+
+
+class RenderableVisitor():
+	"""
+	Default implementation of the visitor pattern. Executes once on
+	each token in the tree and returns itself.
+	"""
+	def TextSpan(self, span: TextSpan):
+		return self
+
+	def LineBreak(self, span: LineBreak):
+		return self
+
+	def ParsedArticle(self, span: ParsedArticle):
+		span.recurse(self)
+		return self
+
+	def BodyParagraph(self, span: BodyParagraph):
+		span.recurse(self)
+		return self
+
+	def SignatureParagraph(self, span: SignatureParagraph):
+		span.recurse(self)
+		return self
+
+	def BoldSpan(self, span: BoldSpan):
+		span.recurse(self)
+		return self
+
+	def ItalicSpan(self, span: ItalicSpan):
+		span.recurse(self)
+		return self
+
+	def CitationSpan(self, span: CitationSpan):
+		span.recurse(self)
+		return self
--- a/amanuensis/parser/helpers.py
+++ b/amanuensis/parser/helpers.py
@@ -1,31 +1,22 @@
 import re
-import urllib
+import urllib.parse

-def normalize_title(title):
-	"""
-	Normalizes strings as titles:
-	- Strips leading and trailing whitespace
-	- Merges internal whitespace into a single space
-	- Capitalizes the first word
-	"""
-	cleaned = re.sub(r'\s+', " ", title.strip())
-	return cleaned[:1].capitalize() + cleaned[1:]

-def titlesort(title):
+def titlesort(title: str) -> str:
 	"""
 	Strips articles off of titles for alphabetical sorting purposes
 	"""
 	lower = title.lower()
 	if lower.startswith("the "):
 		return lower[4:]
-	elif lower.startswith("an "):
+	if lower.startswith("an "):
 		return lower[3:]
-	elif lower.startswith("a "):
+	if lower.startswith("a "):
 		return lower[2:]
-	else:
-		return lower
+	return lower

-def filesafe_title(title):
+
+def filesafe_title(title: str) -> str:
 	"""
 	Makes an article title filename-safe.
 	"""
@@ -34,4 +25,4 @@ def filesafe_title(title):
 	s = urllib.parse.quote(s)       # Encode all other characters
 	s = re.sub(r"%", "", s)         # Strip encoding %s
 	s = s[:64]                  	# Limit to 64 characters
-	return s
+	return s
--- a/amanuensis/parser/tokenizer.py
+++ b/amanuensis/parser/tokenizer.py
@@ -1,74 +1,39 @@
 """
-Internal module encapsulating the parsing logic for Lexipython
-markdown. Parse results are represented as a hierarchy of tokens, which
-can be rendered by a renderer.
+Internal module encapsulating a recursive descent parser for
+Lexipython markdown.
 """

 import re
+from typing import Sequence

-from amanuensis.parser.helpers import normalize_title
+from amanuensis.parser.core import (
+	TextSpan,
+	LineBreak,
+	ParsedArticle,
+	BodyParagraph,
+	SignatureParagraph,
+	BoldSpan,
+	ItalicSpan,
+	CitationSpan,
+	Renderable,
+	SpanContainer
+)

-class Renderable():
-	def render(self, renderer):
-		hook = getattr(renderer, type(self).__name__, None)
-		if hook:
-			return hook(self)
-		return None
-
-class TextSpan(Renderable):
-	"""An unstyled length of text"""
-	def __init__(self, innertext):
-		self.innertext = innertext
-	def __str__(self):
-		return f"[{self.innertext}]"
-
-class LineBreak(Renderable):
-	"""A line break within a paragraph"""
-	def __str__(self):
-		return "<break>"
-
-class SpanContainer(Renderable):
-	"""A formatting element that wraps some amount of text"""
-	def __init__(self, spans):
-		self.spans = spans
-	def __str__(self):
-		return f"[{type(self).__name__} {' '.join([str(span) for span in self.spans])}]"
-	def recurse(self, renderer):
-		return [child.render(renderer) for child in self.spans]
-
-class ParsedArticle(SpanContainer):
-	"""Multiple paragraphs"""
-
-class BodyParagraph(SpanContainer):
-	"""A normal paragraph"""
-
-class SignatureParagraph(SpanContainer):
-	"""A paragraph preceded by a signature mark"""
-
-class BoldSpan(SpanContainer):
-	"""A span of text inside bold marks"""
-
-class ItalicSpan(SpanContainer):
-	"""A span of text inside italic marks"""
-
-class CitationSpan(SpanContainer):
-	"""A citation to another article"""
-	def __init__(self, spans, cite_target):
-		super().__init__(spans)
-		# Normalize citation target
-		self.cite_target = normalize_title(cite_target)
-	def __str__(self):
-		return f"{{{' '.join([str(span) for span in self.spans])}:{self.cite_target}}}"
+Spans = Sequence[Renderable]


-def parse_raw_markdown(text):
+def parse_raw_markdown(text: str) -> ParsedArticle:
+	"""
+	Parses a body of Lexipython markdown into a Renderable tree.
+	"""
 	# Parse each paragraph individually, as no formatting applies
 	# across paragraphs
 	paragraphs = re.split(r'\n\n+', text)
 	parse_results = list(map(parse_paragraph, paragraphs))
 	return ParsedArticle(parse_results)

-def parse_paragraph(text):
+
+def parse_paragraph(text: str) -> SpanContainer:
 	# Parse the paragraph as a span of text
 	text = text.strip()
 	if text and text[0] == '~':
@@ -76,7 +41,12 @@ def parse_paragraph(text):
 	else:
 		return BodyParagraph(parse_paired_formatting(text))

-def parse_paired_formatting(text, cite=True, bold=True, italic=True):
+
+def parse_paired_formatting(
+		text: str,
+		cite: bool = True,
+		bold: bool = True,
+		italic: bool = True) -> Spans:
 	# Find positions of any paired formatting
 	first_cite = find_pair(text, "[[", "]]", cite)
 	first_bold = find_pair(text, "**", "**", bold)
@@ -93,7 +63,12 @@ def parse_paired_formatting(text, cite=True, bold=True, italic=True):
 	first = min(finds) if finds else -1
 	return handlers[first]()

-def find_pair(text, open_tag, close_tag, valid):
+
+def find_pair(
+		text: str,
+		open_tag: str,
+		close_tag: str,
+		valid: bool) -> int:
 	# If skipping, return -1
 	if not valid:
 		return -1
@@ -108,7 +83,8 @@ def find_pair(text, open_tag, close_tag, valid):
 	# Otherwise, the pair exists
 	return first

-def parse_citation(text, bold=True, italic=True):
+
+def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
 	cite_open = text.find("[[")
 	if cite_open > -1:
 		cite_close = text.find("]]", cite_open + 2)
@@ -128,50 +104,53 @@ def parse_citation(text, bold=True, italic=True):
 		spans_inner = parse_paired_formatting(text_inner_actual,
 			cite=False, bold=bold, italic=italic)
 		citation = CitationSpan(spans_inner, cite_target)
-		return spans_before + [citation] + spans_after
+		return [*spans_before, citation, *spans_after]
 	# Should never happen
 	return parse_breaks(text)

-def parse_bold(text, cite=True, italic=True):
+
+def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans:
 	bold_open = text.find("**")
 	if bold_open > -1:
 		bold_close = text.find("**", bold_open + 2)
 		# Should be no formatting behind us
 		spans_before = parse_breaks(text[:bold_open])
 		# Freely parse formatting after us
-		spans_after = parse_paired_formatting(text[bold_close+2:])
+		spans_after = parse_paired_formatting(text[bold_close + 2:])
 		# Parse inner text minus bold parsing
-		text_inner = text[bold_open+2:bold_close]
+		text_inner = text[bold_open + 2:bold_close]
 		spans_inner = parse_paired_formatting(text_inner,
 			cite=cite, bold=False, italic=italic)
 		bold = BoldSpan(spans_inner)
-		return spans_before + [bold] + spans_after
+		return [*spans_before, bold, *spans_after]
 	# Should never happen
 	return parse_italic(text)

-def parse_italic(text, cite=True, bold=True):
+
+def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans:
 	italic_open = text.find("//")
 	if italic_open > -1:
 		italic_close = text.find("//", italic_open + 2)
 		# Should be no formatting behind us
 		spans_before = parse_breaks(text[:italic_open])
 		# Freely parse formatting after us
-		spans_after = parse_paired_formatting(text[italic_close+2:])
+		spans_after = parse_paired_formatting(text[italic_close + 2:])
 		# Parse inner text minus italic parsing
-		text_inner = text[italic_open+2:italic_close]
+		text_inner = text[italic_open + 2:italic_close]
 		spans_inner = parse_paired_formatting(text_inner,
 			cite=cite, bold=bold, italic=False)
 		italic = ItalicSpan(spans_inner)
-		return spans_before + [italic] + spans_after
+		return [*spans_before, italic, *spans_after]
 	# Should never happen
 	return parse_breaks(text)

-def parse_breaks(text):
+
+def parse_breaks(text: str) -> Spans:
 	if not text:
 		return []
-	splits = list(map(TextSpan, text.split("\\\\\n")))
-	spans = [splits[0]]
-	for span in splits[1:]:
-		spans.append(LineBreak())
-		spans.append(span)
+	splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
+	spans: Spans = [
+		splits[i // 2] if i % 2 == 0 else LineBreak()
+		for i in range(0, 2 * len(splits) - 1)
+	]
 	return spans
--- a/amanuensis/parser/render.py
+++ b/amanuensis/parser/render.py
@@ -3,7 +3,7 @@ Internal module encapsulating visitors that render articles into
 readable formats.
 """

-from flask import url_for
+from typing import Iterable

 from amanuensis.parser.helpers import filesafe_title

@@ -12,9 +12,9 @@ class HtmlRenderer():
 	"""
 	Renders an article token tree into published article HTML.
 	"""
-	def __init__(self, lexicon_name, written_articles):
-		self.lexicon_name = lexicon_name
-		self.written_articles = written_articles
+	def __init__(self, lexicon_name: str, written_articles: Iterable[str]):
+		self.lexicon_name: str = lexicon_name
+		self.written_articles: Iterable[str] = written_articles

 	def TextSpan(self, span):
 		return span.innertext
@@ -50,11 +50,11 @@ class HtmlRenderer():
 		# 	'lexicon.article',
 		# 	name=self.lexicon_name,
 		# 	title=filesafe_title(span.cite_target))
-		link = f'/lexicon/{self.lexicon_name}/article/{filesafe_title(span.cite_target)}'
+		link = (f'/lexicon/{self.lexicon_name}'
+			+ f'/article/{filesafe_title(span.cite_target)}')
 		return f'<a href="{link}"{link_class}>{"".join(span.recurse(self))}</a>'


-
 class PreviewHtmlRenderer():
 	def __init__(self, lexicon):
 		with lexicon.ctx.read('info') as info:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 astroid==2.3.3
 Click==7.0
+entrypoints==0.3
+flake8==3.7.9
 Flask==1.1.1
 Flask-Login==0.4.1
 Flask-WTF==0.14.2
@@ -9,10 +11,14 @@ Jinja2==2.10.3
 lazy-object-proxy==1.4.3
 MarkupSafe==1.1.1
 mccabe==0.6.1
+mypy==0.770
+mypy-extensions==0.4.3
 pkg-resources==0.0.0
-pylint==2.4.4
+pycodestyle==2.5.0
+pyflakes==2.1.1
 six==1.14.0
 typed-ast==1.4.1
+typing-extensions==3.7.4.2
 Werkzeug==0.16.0
 wrapt==1.11.2
 WTForms==2.2.1
--- a/tox.ini
+++ b/tox.ini
@@ -0,0 +1,13 @@
+[flake8]
+ignore =
+	W191 # we use tabs here
+	W503 # \n before binary op
+	E117 # broken for tabs
+	E126 # tabs
+	E128 # tabs
+exclude =
+	.git
+	__pycache__
+
+[mypy]
+ignore_missing_imports = True