Reorganize parser and style pass

2021-06-09 15:57:48 -07:00 · 2021-06-09 15:57:48 -07:00 · 1c55d866a8
commit 1c55d866a8
parent ffa27be86d
6 changed files with 280 additions and 234 deletions
--- a/amanuensis/parser/__init__.py
+++ b/amanuensis/parser/__init__.py
@@ -2,13 +2,14 @@
 Module encapsulating all markdown parsing functionality.
 """
-from .core import normalize_title
+from .core import RenderableVisitor
-from .helpers import titlesort, filesafe_title
+from .helpers import normalize_title, filesafe_title, titlesort
 from .parsing import parse_raw_markdown
 __all__ = [
-	normalize_title.__name__,
+    "RenderableVisitor",
-	titlesort.__name__,
+    "normalize_title",
-	filesafe_title.__name__,
+    "filesafe_title",
-	parse_raw_markdown.__name__,
+    "titlesort",
    "parse_raw_markdown",
 ]
--- a/amanuensis/parser/core.py
+++ b/amanuensis/parser/core.py
@@ -5,131 +5,134 @@ which can be operated on by a visitor defining functions that hook off
 of the different token types.
 """
 import re
 from typing import Callable, Any, Sequence
-RenderHook = Callable[['Renderable'], Any]
+from .helpers import normalize_title
 Spans = Sequence['Renderable']
-def normalize_title(title: str) -> str:
+RenderHook = Callable[["Renderable"], Any]
-	"""
+Spans = Sequence["Renderable"]
 	Normalizes strings as titles:
 	- Strips leading and trailing whitespace
 	- Merges internal whitespace into a single space
 	- Capitalizes the first word
 	"""
 	cleaned = re.sub(r'\s+', " ", title.strip())
 	return cleaned[:1].capitalize() + cleaned[1:]
-class Renderable():
+class Renderable:
-	"""
+    """
-	Base class for parsed markdown. Provides the `render()` method for
+    Base class for parsed markdown. Provides the `render()` method for
-	visiting the token tree.
+    visiting the token tree.
-	"""
+    """
-	def render(self: 'Renderable', renderer: 'RenderableVisitor'):
+
-		"""
+    def render(self: "Renderable", renderer: "RenderableVisitor"):
-		Execute the apppropriate visitor method on this Renderable.
+        """
-		"""
+        Execute the apppropriate visitor method on this Renderable.
-		hook: RenderHook = getattr(renderer, type(self).__name__, None)
+        Visitors implement hooks by declaring methods whose names are
-		if hook:
+        the name of a Renderable class.
-			return hook(self)
+        """
-		return None
+        hook: RenderHook = getattr(renderer, type(self).__name__, None)
        if hook:
            return hook(self)
        return None
 class TextSpan(Renderable):
-	"""An unstyled length of text."""
+    """A length of text."""
 	def __init__(self, innertext: str):
 		self.innertext = innertext
-	def __str__(self):
+    def __init__(self, innertext: str):
-		return f"[{self.innertext}]"
+        self.innertext = innertext
    def __str__(self):
        return f"[{self.innertext}]"
 class LineBreak(Renderable):
-	"""A line break within a paragraph."""
+    """A line break within a paragraph."""
-	def __str__(self):
+
-		return "<break>"
+    def __str__(self):
        return "<break>"
 class SpanContainer(Renderable):
-	"""A formatting element that wraps some amount of text."""
+    """A formatting element that wraps some amount of text."""
 	def __init__(self, spans: Spans):
 		self.spans: Spans = spans
-	def __str__(self):
+    def __init__(self, spans: Spans):
-		return (f'[{type(self).__name__} '
+        self.spans: Spans = spans
 			+ f'{" ".join([str(span) for span in self.spans])}]')
-	def recurse(self, renderer: 'RenderableVisitor'):
+    def __str__(self):
-		return [child.render(renderer) for child in self.spans]
+        return (
            f"[{type(self).__name__} "
            + f'{" ".join([str(span) for span in self.spans])}]'
        )
    def recurse(self, renderer: "RenderableVisitor"):
        return [child.render(renderer) for child in self.spans]
 class ParsedArticle(SpanContainer):
-	"""Token tree root node, containing some number of paragraph tokens."""
+    """Token tree root node, containing some number of paragraph tokens."""
 class BodyParagraph(SpanContainer):
-	"""A normal paragraph."""
+    """A normal paragraph."""
 class SignatureParagraph(SpanContainer):
-	"""A paragraph preceded by a signature mark."""
+    """A paragraph preceded by a signature mark."""
 class BoldSpan(SpanContainer):
-	"""A span of text inside bold marks."""
+    """A span of text inside bold marks."""
 class ItalicSpan(SpanContainer):
-	"""A span of text inside italic marks."""
+    """A span of text inside italic marks."""
 class CitationSpan(SpanContainer):
-	"""A citation to another article."""
+    """A citation to another article."""
 	def __init__(self, spans: Spans, cite_target: str):
 		super().__init__(spans)
 		# Normalize citation target on parse, since we don't want
 		# abnormal title strings lying around causing trouble.
 		self.cite_target: str = normalize_title(cite_target)
-	def __str__(self):
+    def __init__(self, spans: Spans, cite_target: str):
-		return (f'{{{" ".join([str(span) for span in self.spans])}'
+        super().__init__(spans)
-			+ f':{self.cite_target}}}')
+        # Normalize citation target on parse, since we don't want
        # abnormal title strings lying around causing trouble.
        self.cite_target: str = normalize_title(cite_target)
    def __str__(self) -> str:
        return (
            f'{{{" ".join([str(span) for span in self.spans])}'
            + f":{self.cite_target}}}"
        )
-class RenderableVisitor():
+class RenderableVisitor:
-	"""
+    """
-	Default implementation of the visitor pattern. Executes once on
+    Default implementation of the visitor pattern. Executes once on
-	each token in the tree and returns itself.
+    each token in the tree and returns itself.
-	"""
+    """
 	def TextSpan(self, span: TextSpan):
 		return self
-	def LineBreak(self, span: LineBreak):
+    def TextSpan(self, span: TextSpan):
-		return self
+        return self
-	def ParsedArticle(self, span: ParsedArticle):
+    def LineBreak(self, span: LineBreak):
-		span.recurse(self)
+        return self
 		return self
-	def BodyParagraph(self, span: BodyParagraph):
+    def ParsedArticle(self, span: ParsedArticle):
-		span.recurse(self)
+        span.recurse(self)
-		return self
+        return self
-	def SignatureParagraph(self, span: SignatureParagraph):
+    def BodyParagraph(self, span: BodyParagraph):
-		span.recurse(self)
+        span.recurse(self)
-		return self
+        return self
-	def BoldSpan(self, span: BoldSpan):
+    def SignatureParagraph(self, span: SignatureParagraph):
-		span.recurse(self)
+        span.recurse(self)
-		return self
+        return self
-	def ItalicSpan(self, span: ItalicSpan):
+    def BoldSpan(self, span: BoldSpan):
-		span.recurse(self)
+        span.recurse(self)
-		return self
+        return self
-	def CitationSpan(self, span: CitationSpan):
+    def ItalicSpan(self, span: ItalicSpan):
-		span.recurse(self)
+        span.recurse(self)
-		return self
+        return self
    def CitationSpan(self, span: CitationSpan):
        span.recurse(self)
        return self
--- a/amanuensis/parser/helpers.py
+++ b/amanuensis/parser/helpers.py
@@ -1,28 +1,53 @@
 """
 Helper functions for manipulating titles during parsing
 """
 import re
 import urllib.parse
 def normalize_title(title: str) -> str:
    """
    Normalizes strings as titles:
    - Strips leading and trailing whitespace
    - Merges internal whitespace into a single space
    - Capitalizes the first word
    """
    cleaned = re.sub(r"\s+", " ", title.strip())
    return cleaned[:1].capitalize() + cleaned[1:]
 def titlesort(title: str) -> str:
-	"""
+    """
-	Strips articles off of titles for alphabetical sorting purposes
+    Strips articles off of titles for alphabetical sorting purposes
-	"""
+    """
-	lower = title.lower()
+    lower = title.lower()
-	if lower.startswith("the "):
+    if lower.startswith("the "):
-		return lower[4:]
+        return lower[4:]
-	if lower.startswith("an "):
+    if lower.startswith("an "):
-		return lower[3:]
+        return lower[3:]
-	if lower.startswith("a "):
+    if lower.startswith("a "):
-		return lower[2:]
+        return lower[2:]
-	return lower
+    return lower
 def filesafe_title(title: str) -> str:
-	"""
+    """
-	Makes an article title filename-safe.
+    Makes an article title filename-safe.
-	"""
+    """
-	s = re.sub(r"\s+", '_', title)  # Replace whitespace with _
+    # Replace whitespace with _
-	s = re.sub(r"~", '-', s)        # parse.quote doesn't catch ~
+    s = re.sub(r"\s+", "_", title)
-	s = urllib.parse.quote(s)       # Encode all other characters
+
-	s = re.sub(r"%", "", s)         # Strip encoding %s
+    # parse.quote doesn't catch ~
-	s = s[:64]                  	# Limit to 64 characters
+    s = re.sub(r"~", "-", s)
-	return s
+
    # Encode all other characters
    s = urllib.parse.quote(s)
    # Strip encoding %s
    s = re.sub(r"%", "", s)
    # Limit to 64 characters
    s = s[:64]
    return s
--- a/amanuensis/parser/parsing.py
+++ b/amanuensis/parser/parsing.py
@@ -7,150 +7,167 @@ import re
 from typing import Sequence
 from .core import (
-	TextSpan,
+    TextSpan,
-	LineBreak,
+    LineBreak,
-	ParsedArticle,
+    ParsedArticle,
-	BodyParagraph,
+    BodyParagraph,
-	SignatureParagraph,
+    SignatureParagraph,
-	BoldSpan,
+    BoldSpan,
-	ItalicSpan,
+    ItalicSpan,
-	CitationSpan,
+    CitationSpan,
-	Renderable,
+    Renderable,
-	SpanContainer
+    SpanContainer,
 )
 Spans = Sequence[Renderable]
 def parse_raw_markdown(text: str) -> ParsedArticle:
-	"""
+    """
-	Parses a body of Lexipython markdown into a Renderable tree.
+    Parses a body of Lexipython markdown into a Renderable tree.
-	"""
+    """
-	# Parse each paragraph individually, as no formatting applies
+    # Parse each paragraph individually, as no formatting applies
-	# across paragraphs
+    # across paragraphs
-	paragraphs = re.split(r'\n\n+', text)
+    paragraphs = re.split(r"\n\n+", text)
-	parse_results = list(map(parse_paragraph, paragraphs))
+    parse_results = list(map(parse_paragraph, paragraphs))
-	return ParsedArticle(parse_results)
+    return ParsedArticle(parse_results)
 def parse_paragraph(text: str) -> SpanContainer:
-	# Parse the paragraph as a span of text
+    # Parse the paragraph as a span of text
-	text = text.strip()
+    text = text.strip()
-	if text and text[0] == '~':
+    if text and text[0] == "~":
-		return SignatureParagraph(parse_paired_formatting(text[1:]))
+        return SignatureParagraph(parse_paired_formatting(text[1:]))
-	else:
+    else:
-		return BodyParagraph(parse_paired_formatting(text))
+        return BodyParagraph(parse_paired_formatting(text))
 def parse_paired_formatting(
-		text: str,
+    text: str,
-		cite: bool = True,
+    cite: bool = True,
-		bold: bool = True,
+    bold: bool = True,
-		italic: bool = True) -> Spans:
+    italic: bool = True,
-	# Find positions of any paired formatting
+) -> Spans:
-	first_cite = find_pair(text, "[[", "]]", cite)
+    # Find positions of any paired formatting
-	first_bold = find_pair(text, "**", "**", bold)
+    first_cite = find_pair(text, "[[", "]]", cite)
-	first_italic = find_pair(text, "//", "//", italic)
+    first_bold = find_pair(text, "**", "**", bold)
-	# Load the possible parse handlers into the map
+    first_italic = find_pair(text, "//", "//", italic)
-	handlers = {}
+    # Load the possible parse handlers into the map
-	handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic)
+    handlers = {}
-	handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic)
+    handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic)
-	handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold)
+    handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic)
-	# If nothing was found, move on to the next parsing step
+    handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold)
-	handlers[-1] = lambda: parse_breaks(text)
+    # If nothing was found, move on to the next parsing step
-	# Choose a handler based on the earliest found result
+    handlers[-1] = lambda: parse_breaks(text)
-	finds = [i for i in (first_cite, first_bold, first_italic) if i > -1]
+    # Choose a handler based on the earliest found result
-	first = min(finds) if finds else -1
+    finds = [i for i in (first_cite, first_bold, first_italic) if i > -1]
-	return handlers[first]()
+    first = min(finds) if finds else -1
    return handlers[first]()
 def find_pair(
-		text: str,
+    text: str,
-		open_tag: str,
+    open_tag: str,
-		close_tag: str,
+    close_tag: str,
-		valid: bool) -> int:
+    valid: bool,
-	# If skipping, return -1
+) -> int:
-	if not valid:
+    # If skipping, return -1
-		return -1
+    if not valid:
-	# If the open tag wasn't found, return -1
+        return -1
-	first = text.find(open_tag)
+    # If the open tag wasn't found, return -1
-	if first < 0:
+    first = text.find(open_tag)
-		return -1
+    if first < 0:
-	# If the close tag wasn't found after the open tag, return -1
+        return -1
-	second = text.find(close_tag, first + len(open_tag))
+    # If the close tag wasn't found after the open tag, return -1
-	if second < 0:
+    second = text.find(close_tag, first + len(open_tag))
-		return -1
+    if second < 0:
-	# Otherwise, the pair exists
+        return -1
-	return first
+    # Otherwise, the pair exists
    return first
-def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
+def parse_citation(
-	cite_open = text.find("[[")
+    text: str,
-	if cite_open > -1:
+    bold: bool = True,
-		cite_close = text.find("]]", cite_open + 2)
+    italic: bool = True,
-		# Since we searched for pairs from the beginning, there should be no
+) -> Spans:
-		# undetected pair formatting before this one, so move to the next
+    cite_open = text.find("[[")
-		# level of parsing
+    if cite_open > -1:
-		spans_before = parse_breaks(text[:cite_open])
+        cite_close = text.find("]]", cite_open + 2)
-		# Continue parsing pair formatting after this one closes with all
+        # Since we searched for pairs from the beginning, there should be no
-		# three as valid choices
+        # undetected pair formatting before this one, so move to the next
-		spans_after = parse_paired_formatting(text[cite_close + 2:])
+        # level of parsing
-		# Parse inner text and skip parsing for this format pair
+        spans_before = parse_breaks(text[:cite_open])
-		text_inner = text[cite_open + 2:cite_close]
+        # Continue parsing pair formatting after this one closes with all
-		# For citations specifically, we may need to split off a citation
+        # three as valid choices
-		# target from the alias text
+        spans_after = parse_paired_formatting(text[cite_close + 2 :])
-		inner_split = text_inner.split("|", 1)
+        # Parse inner text and skip parsing for this format pair
-		text_inner_actual, cite_target = inner_split[0], inner_split[-1]
+        text_inner = text[cite_open + 2 : cite_close]
-		spans_inner = parse_paired_formatting(text_inner_actual,
+        # For citations specifically, we may need to split off a citation
-			cite=False, bold=bold, italic=italic)
+        # target from the alias text
-		citation = CitationSpan(spans_inner, cite_target)
+        inner_split = text_inner.split("|", 1)
-		return [*spans_before, citation, *spans_after]
+        text_inner_actual, cite_target = inner_split[0], inner_split[-1]
-	# Should never happen
+        spans_inner = parse_paired_formatting(
-	return parse_breaks(text)
+            text_inner_actual, cite=False, bold=bold, italic=italic
        )
        citation = CitationSpan(spans_inner, cite_target)
        return [*spans_before, citation, *spans_after]
    # Should never happen
    return parse_breaks(text)
-def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans:
+def parse_bold(
-	bold_open = text.find("**")
+    text: str,
-	if bold_open > -1:
+    cite: bool = True,
-		bold_close = text.find("**", bold_open + 2)
+    italic: bool = True,
-		# Should be no formatting behind us
+) -> Spans:
-		spans_before = parse_breaks(text[:bold_open])
+    bold_open = text.find("**")
-		# Freely parse formatting after us
+    if bold_open > -1:
-		spans_after = parse_paired_formatting(text[bold_close + 2:])
+        bold_close = text.find("**", bold_open + 2)
-		# Parse inner text minus bold parsing
+        # Should be no formatting behind us
-		text_inner = text[bold_open + 2:bold_close]
+        spans_before = parse_breaks(text[:bold_open])
-		spans_inner = parse_paired_formatting(text_inner,
+        # Freely parse formatting after us
-			cite=cite, bold=False, italic=italic)
+        spans_after = parse_paired_formatting(text[bold_close + 2 :])
-		bold = BoldSpan(spans_inner)
+        # Parse inner text minus bold parsing
-		return [*spans_before, bold, *spans_after]
+        text_inner = text[bold_open + 2 : bold_close]
-	# Should never happen
+        spans_inner = parse_paired_formatting(
-	return parse_italic(text)
+            text_inner, cite=cite, bold=False, italic=italic
        )
        bold = BoldSpan(spans_inner)
        return [*spans_before, bold, *spans_after]
    # Should never happen
    return parse_italic(text)
-def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans:
+def parse_italic(
-	italic_open = text.find("//")
+    text: str,
-	if italic_open > -1:
+    cite: bool = True,
-		italic_close = text.find("//", italic_open + 2)
+    bold: bool = True,
-		# Should be no formatting behind us
+) -> Spans:
-		spans_before = parse_breaks(text[:italic_open])
+    italic_open = text.find("//")
-		# Freely parse formatting after us
+    if italic_open > -1:
-		spans_after = parse_paired_formatting(text[italic_close + 2:])
+        italic_close = text.find("//", italic_open + 2)
-		# Parse inner text minus italic parsing
+        # Should be no formatting behind us
-		text_inner = text[italic_open + 2:italic_close]
+        spans_before = parse_breaks(text[:italic_open])
-		spans_inner = parse_paired_formatting(text_inner,
+        # Freely parse formatting after us
-			cite=cite, bold=bold, italic=False)
+        spans_after = parse_paired_formatting(text[italic_close + 2 :])
-		italic = ItalicSpan(spans_inner)
+        # Parse inner text minus italic parsing
-		return [*spans_before, italic, *spans_after]
+        text_inner = text[italic_open + 2 : italic_close]
-	# Should never happen
+        spans_inner = parse_paired_formatting(
-	return parse_breaks(text)
+            text_inner, cite=cite, bold=bold, italic=False
        )
        italic = ItalicSpan(spans_inner)
        return [*spans_before, italic, *spans_after]
    # Should never happen
    return parse_breaks(text)
 def parse_breaks(text: str) -> Spans:
-	if not text:
+    if not text:
-		return []
+        return []
-	splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
+    splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
-	spans: Spans = [
+    spans: Spans = [
-		splits[i // 2] if i % 2 == 0 else LineBreak()
+        splits[i // 2] if i % 2 == 0 else LineBreak()
-		for i in range(0, 2 * len(splits) - 1)
+        for i in range(0, 2 * len(splits) - 1)
-	]
+    ]
-	return spans
+    return spans
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,4 +1,4 @@
 [mypy]
 ignore_missing_imports = true
-exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
+exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
 ; mypy stable doesn't support pyproject.toml yet
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,11 +17,11 @@ black = "^21.5b2"
 mypy = "^0.812"
 [tool.black]
-extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
+extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
 [tool.mypy]
 ignore_missing_imports = true
-exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
+exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
 [tool.pytest.ini_options]
 addopts = "--show-capture=log"