From 1c55d866a8054cc5627a342f453046a1fd7f007e Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 15:57:48 -0700 Subject: [PATCH] Reorganize parser and style pass --- amanuensis/parser/__init__.py | 13 +- amanuensis/parser/core.py | 167 ++++++++++----------- amanuensis/parser/helpers.py | 65 ++++++--- amanuensis/parser/parsing.py | 263 ++++++++++++++++++---------------- mypy.ini | 2 +- pyproject.toml | 4 +- 6 files changed, 280 insertions(+), 234 deletions(-) diff --git a/amanuensis/parser/__init__.py b/amanuensis/parser/__init__.py index aff1bd4..7aa5bd7 100644 --- a/amanuensis/parser/__init__.py +++ b/amanuensis/parser/__init__.py @@ -2,13 +2,14 @@ Module encapsulating all markdown parsing functionality. """ -from .core import normalize_title -from .helpers import titlesort, filesafe_title +from .core import RenderableVisitor +from .helpers import normalize_title, filesafe_title, titlesort from .parsing import parse_raw_markdown __all__ = [ - normalize_title.__name__, - titlesort.__name__, - filesafe_title.__name__, - parse_raw_markdown.__name__, + "RenderableVisitor", + "normalize_title", + "filesafe_title", + "titlesort", + "parse_raw_markdown", ] diff --git a/amanuensis/parser/core.py b/amanuensis/parser/core.py index 76f15de..d50049a 100644 --- a/amanuensis/parser/core.py +++ b/amanuensis/parser/core.py @@ -5,131 +5,134 @@ which can be operated on by a visitor defining functions that hook off of the different token types. """ -import re from typing import Callable, Any, Sequence -RenderHook = Callable[['Renderable'], Any] -Spans = Sequence['Renderable'] +from .helpers import normalize_title -def normalize_title(title: str) -> str: - """ - Normalizes strings as titles: - - Strips leading and trailing whitespace - - Merges internal whitespace into a single space - - Capitalizes the first word - """ - cleaned = re.sub(r'\s+', " ", title.strip()) - return cleaned[:1].capitalize() + cleaned[1:] +RenderHook = Callable[["Renderable"], Any] +Spans = Sequence["Renderable"] -class Renderable(): - """ - Base class for parsed markdown. Provides the `render()` method for - visiting the token tree. - """ - def render(self: 'Renderable', renderer: 'RenderableVisitor'): - """ - Execute the apppropriate visitor method on this Renderable. - """ - hook: RenderHook = getattr(renderer, type(self).__name__, None) - if hook: - return hook(self) - return None +class Renderable: + """ + Base class for parsed markdown. Provides the `render()` method for + visiting the token tree. + """ + + def render(self: "Renderable", renderer: "RenderableVisitor"): + """ + Execute the apppropriate visitor method on this Renderable. + Visitors implement hooks by declaring methods whose names are + the name of a Renderable class. + """ + hook: RenderHook = getattr(renderer, type(self).__name__, None) + if hook: + return hook(self) + return None class TextSpan(Renderable): - """An unstyled length of text.""" - def __init__(self, innertext: str): - self.innertext = innertext + """A length of text.""" - def __str__(self): - return f"[{self.innertext}]" + def __init__(self, innertext: str): + self.innertext = innertext + + def __str__(self): + return f"[{self.innertext}]" class LineBreak(Renderable): - """A line break within a paragraph.""" - def __str__(self): - return "" + """A line break within a paragraph.""" + + def __str__(self): + return "" class SpanContainer(Renderable): - """A formatting element that wraps some amount of text.""" - def __init__(self, spans: Spans): - self.spans: Spans = spans + """A formatting element that wraps some amount of text.""" - def __str__(self): - return (f'[{type(self).__name__} ' - + f'{" ".join([str(span) for span in self.spans])}]') + def __init__(self, spans: Spans): + self.spans: Spans = spans - def recurse(self, renderer: 'RenderableVisitor'): - return [child.render(renderer) for child in self.spans] + def __str__(self): + return ( + f"[{type(self).__name__} " + + f'{" ".join([str(span) for span in self.spans])}]' + ) + + def recurse(self, renderer: "RenderableVisitor"): + return [child.render(renderer) for child in self.spans] class ParsedArticle(SpanContainer): - """Token tree root node, containing some number of paragraph tokens.""" + """Token tree root node, containing some number of paragraph tokens.""" class BodyParagraph(SpanContainer): - """A normal paragraph.""" + """A normal paragraph.""" class SignatureParagraph(SpanContainer): - """A paragraph preceded by a signature mark.""" + """A paragraph preceded by a signature mark.""" class BoldSpan(SpanContainer): - """A span of text inside bold marks.""" + """A span of text inside bold marks.""" class ItalicSpan(SpanContainer): - """A span of text inside italic marks.""" + """A span of text inside italic marks.""" class CitationSpan(SpanContainer): - """A citation to another article.""" - def __init__(self, spans: Spans, cite_target: str): - super().__init__(spans) - # Normalize citation target on parse, since we don't want - # abnormal title strings lying around causing trouble. - self.cite_target: str = normalize_title(cite_target) + """A citation to another article.""" - def __str__(self): - return (f'{{{" ".join([str(span) for span in self.spans])}' - + f':{self.cite_target}}}') + def __init__(self, spans: Spans, cite_target: str): + super().__init__(spans) + # Normalize citation target on parse, since we don't want + # abnormal title strings lying around causing trouble. + self.cite_target: str = normalize_title(cite_target) + + def __str__(self) -> str: + return ( + f'{{{" ".join([str(span) for span in self.spans])}' + + f":{self.cite_target}}}" + ) -class RenderableVisitor(): - """ - Default implementation of the visitor pattern. Executes once on - each token in the tree and returns itself. - """ - def TextSpan(self, span: TextSpan): - return self +class RenderableVisitor: + """ + Default implementation of the visitor pattern. Executes once on + each token in the tree and returns itself. + """ - def LineBreak(self, span: LineBreak): - return self + def TextSpan(self, span: TextSpan): + return self - def ParsedArticle(self, span: ParsedArticle): - span.recurse(self) - return self + def LineBreak(self, span: LineBreak): + return self - def BodyParagraph(self, span: BodyParagraph): - span.recurse(self) - return self + def ParsedArticle(self, span: ParsedArticle): + span.recurse(self) + return self - def SignatureParagraph(self, span: SignatureParagraph): - span.recurse(self) - return self + def BodyParagraph(self, span: BodyParagraph): + span.recurse(self) + return self - def BoldSpan(self, span: BoldSpan): - span.recurse(self) - return self + def SignatureParagraph(self, span: SignatureParagraph): + span.recurse(self) + return self - def ItalicSpan(self, span: ItalicSpan): - span.recurse(self) - return self + def BoldSpan(self, span: BoldSpan): + span.recurse(self) + return self - def CitationSpan(self, span: CitationSpan): - span.recurse(self) - return self + def ItalicSpan(self, span: ItalicSpan): + span.recurse(self) + return self + + def CitationSpan(self, span: CitationSpan): + span.recurse(self) + return self diff --git a/amanuensis/parser/helpers.py b/amanuensis/parser/helpers.py index e6eabfb..7f2123d 100644 --- a/amanuensis/parser/helpers.py +++ b/amanuensis/parser/helpers.py @@ -1,28 +1,53 @@ +""" +Helper functions for manipulating titles during parsing +""" + import re import urllib.parse +def normalize_title(title: str) -> str: + """ + Normalizes strings as titles: + - Strips leading and trailing whitespace + - Merges internal whitespace into a single space + - Capitalizes the first word + """ + cleaned = re.sub(r"\s+", " ", title.strip()) + return cleaned[:1].capitalize() + cleaned[1:] + + def titlesort(title: str) -> str: - """ - Strips articles off of titles for alphabetical sorting purposes - """ - lower = title.lower() - if lower.startswith("the "): - return lower[4:] - if lower.startswith("an "): - return lower[3:] - if lower.startswith("a "): - return lower[2:] - return lower + """ + Strips articles off of titles for alphabetical sorting purposes + """ + lower = title.lower() + if lower.startswith("the "): + return lower[4:] + if lower.startswith("an "): + return lower[3:] + if lower.startswith("a "): + return lower[2:] + return lower def filesafe_title(title: str) -> str: - """ - Makes an article title filename-safe. - """ - s = re.sub(r"\s+", '_', title) # Replace whitespace with _ - s = re.sub(r"~", '-', s) # parse.quote doesn't catch ~ - s = urllib.parse.quote(s) # Encode all other characters - s = re.sub(r"%", "", s) # Strip encoding %s - s = s[:64] # Limit to 64 characters - return s + """ + Makes an article title filename-safe. + """ + # Replace whitespace with _ + s = re.sub(r"\s+", "_", title) + + # parse.quote doesn't catch ~ + s = re.sub(r"~", "-", s) + + # Encode all other characters + s = urllib.parse.quote(s) + + # Strip encoding %s + s = re.sub(r"%", "", s) + + # Limit to 64 characters + s = s[:64] + + return s diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py index 4e0695f..c6bb50b 100644 --- a/amanuensis/parser/parsing.py +++ b/amanuensis/parser/parsing.py @@ -7,150 +7,167 @@ import re from typing import Sequence from .core import ( - TextSpan, - LineBreak, - ParsedArticle, - BodyParagraph, - SignatureParagraph, - BoldSpan, - ItalicSpan, - CitationSpan, - Renderable, - SpanContainer + TextSpan, + LineBreak, + ParsedArticle, + BodyParagraph, + SignatureParagraph, + BoldSpan, + ItalicSpan, + CitationSpan, + Renderable, + SpanContainer, ) Spans = Sequence[Renderable] def parse_raw_markdown(text: str) -> ParsedArticle: - """ - Parses a body of Lexipython markdown into a Renderable tree. - """ - # Parse each paragraph individually, as no formatting applies - # across paragraphs - paragraphs = re.split(r'\n\n+', text) - parse_results = list(map(parse_paragraph, paragraphs)) - return ParsedArticle(parse_results) + """ + Parses a body of Lexipython markdown into a Renderable tree. + """ + # Parse each paragraph individually, as no formatting applies + # across paragraphs + paragraphs = re.split(r"\n\n+", text) + parse_results = list(map(parse_paragraph, paragraphs)) + return ParsedArticle(parse_results) def parse_paragraph(text: str) -> SpanContainer: - # Parse the paragraph as a span of text - text = text.strip() - if text and text[0] == '~': - return SignatureParagraph(parse_paired_formatting(text[1:])) - else: - return BodyParagraph(parse_paired_formatting(text)) + # Parse the paragraph as a span of text + text = text.strip() + if text and text[0] == "~": + return SignatureParagraph(parse_paired_formatting(text[1:])) + else: + return BodyParagraph(parse_paired_formatting(text)) def parse_paired_formatting( - text: str, - cite: bool = True, - bold: bool = True, - italic: bool = True) -> Spans: - # Find positions of any paired formatting - first_cite = find_pair(text, "[[", "]]", cite) - first_bold = find_pair(text, "**", "**", bold) - first_italic = find_pair(text, "//", "//", italic) - # Load the possible parse handlers into the map - handlers = {} - handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) - handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) - handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) - # If nothing was found, move on to the next parsing step - handlers[-1] = lambda: parse_breaks(text) - # Choose a handler based on the earliest found result - finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] - first = min(finds) if finds else -1 - return handlers[first]() + text: str, + cite: bool = True, + bold: bool = True, + italic: bool = True, +) -> Spans: + # Find positions of any paired formatting + first_cite = find_pair(text, "[[", "]]", cite) + first_bold = find_pair(text, "**", "**", bold) + first_italic = find_pair(text, "//", "//", italic) + # Load the possible parse handlers into the map + handlers = {} + handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) + handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) + handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) + # If nothing was found, move on to the next parsing step + handlers[-1] = lambda: parse_breaks(text) + # Choose a handler based on the earliest found result + finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] + first = min(finds) if finds else -1 + return handlers[first]() def find_pair( - text: str, - open_tag: str, - close_tag: str, - valid: bool) -> int: - # If skipping, return -1 - if not valid: - return -1 - # If the open tag wasn't found, return -1 - first = text.find(open_tag) - if first < 0: - return -1 - # If the close tag wasn't found after the open tag, return -1 - second = text.find(close_tag, first + len(open_tag)) - if second < 0: - return -1 - # Otherwise, the pair exists - return first + text: str, + open_tag: str, + close_tag: str, + valid: bool, +) -> int: + # If skipping, return -1 + if not valid: + return -1 + # If the open tag wasn't found, return -1 + first = text.find(open_tag) + if first < 0: + return -1 + # If the close tag wasn't found after the open tag, return -1 + second = text.find(close_tag, first + len(open_tag)) + if second < 0: + return -1 + # Otherwise, the pair exists + return first -def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans: - cite_open = text.find("[[") - if cite_open > -1: - cite_close = text.find("]]", cite_open + 2) - # Since we searched for pairs from the beginning, there should be no - # undetected pair formatting before this one, so move to the next - # level of parsing - spans_before = parse_breaks(text[:cite_open]) - # Continue parsing pair formatting after this one closes with all - # three as valid choices - spans_after = parse_paired_formatting(text[cite_close + 2:]) - # Parse inner text and skip parsing for this format pair - text_inner = text[cite_open + 2:cite_close] - # For citations specifically, we may need to split off a citation - # target from the alias text - inner_split = text_inner.split("|", 1) - text_inner_actual, cite_target = inner_split[0], inner_split[-1] - spans_inner = parse_paired_formatting(text_inner_actual, - cite=False, bold=bold, italic=italic) - citation = CitationSpan(spans_inner, cite_target) - return [*spans_before, citation, *spans_after] - # Should never happen - return parse_breaks(text) +def parse_citation( + text: str, + bold: bool = True, + italic: bool = True, +) -> Spans: + cite_open = text.find("[[") + if cite_open > -1: + cite_close = text.find("]]", cite_open + 2) + # Since we searched for pairs from the beginning, there should be no + # undetected pair formatting before this one, so move to the next + # level of parsing + spans_before = parse_breaks(text[:cite_open]) + # Continue parsing pair formatting after this one closes with all + # three as valid choices + spans_after = parse_paired_formatting(text[cite_close + 2 :]) + # Parse inner text and skip parsing for this format pair + text_inner = text[cite_open + 2 : cite_close] + # For citations specifically, we may need to split off a citation + # target from the alias text + inner_split = text_inner.split("|", 1) + text_inner_actual, cite_target = inner_split[0], inner_split[-1] + spans_inner = parse_paired_formatting( + text_inner_actual, cite=False, bold=bold, italic=italic + ) + citation = CitationSpan(spans_inner, cite_target) + return [*spans_before, citation, *spans_after] + # Should never happen + return parse_breaks(text) -def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans: - bold_open = text.find("**") - if bold_open > -1: - bold_close = text.find("**", bold_open + 2) - # Should be no formatting behind us - spans_before = parse_breaks(text[:bold_open]) - # Freely parse formatting after us - spans_after = parse_paired_formatting(text[bold_close + 2:]) - # Parse inner text minus bold parsing - text_inner = text[bold_open + 2:bold_close] - spans_inner = parse_paired_formatting(text_inner, - cite=cite, bold=False, italic=italic) - bold = BoldSpan(spans_inner) - return [*spans_before, bold, *spans_after] - # Should never happen - return parse_italic(text) +def parse_bold( + text: str, + cite: bool = True, + italic: bool = True, +) -> Spans: + bold_open = text.find("**") + if bold_open > -1: + bold_close = text.find("**", bold_open + 2) + # Should be no formatting behind us + spans_before = parse_breaks(text[:bold_open]) + # Freely parse formatting after us + spans_after = parse_paired_formatting(text[bold_close + 2 :]) + # Parse inner text minus bold parsing + text_inner = text[bold_open + 2 : bold_close] + spans_inner = parse_paired_formatting( + text_inner, cite=cite, bold=False, italic=italic + ) + bold = BoldSpan(spans_inner) + return [*spans_before, bold, *spans_after] + # Should never happen + return parse_italic(text) -def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans: - italic_open = text.find("//") - if italic_open > -1: - italic_close = text.find("//", italic_open + 2) - # Should be no formatting behind us - spans_before = parse_breaks(text[:italic_open]) - # Freely parse formatting after us - spans_after = parse_paired_formatting(text[italic_close + 2:]) - # Parse inner text minus italic parsing - text_inner = text[italic_open + 2:italic_close] - spans_inner = parse_paired_formatting(text_inner, - cite=cite, bold=bold, italic=False) - italic = ItalicSpan(spans_inner) - return [*spans_before, italic, *spans_after] - # Should never happen - return parse_breaks(text) +def parse_italic( + text: str, + cite: bool = True, + bold: bool = True, +) -> Spans: + italic_open = text.find("//") + if italic_open > -1: + italic_close = text.find("//", italic_open + 2) + # Should be no formatting behind us + spans_before = parse_breaks(text[:italic_open]) + # Freely parse formatting after us + spans_after = parse_paired_formatting(text[italic_close + 2 :]) + # Parse inner text minus italic parsing + text_inner = text[italic_open + 2 : italic_close] + spans_inner = parse_paired_formatting( + text_inner, cite=cite, bold=bold, italic=False + ) + italic = ItalicSpan(spans_inner) + return [*spans_before, italic, *spans_after] + # Should never happen + return parse_breaks(text) def parse_breaks(text: str) -> Spans: - if not text: - return [] - splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) - spans: Spans = [ - splits[i // 2] if i % 2 == 0 else LineBreak() - for i in range(0, 2 * len(splits) - 1) - ] - return spans + if not text: + return [] + splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) + spans: Spans = [ + splits[i // 2] if i % 2 == 0 else LineBreak() + for i in range(0, 2 * len(splits) - 1) + ] + return spans diff --git a/mypy.ini b/mypy.ini index 0d8ecb7..febf6cd 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,4 +1,4 @@ [mypy] ignore_missing_imports = true -exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" +exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" ; mypy stable doesn't support pyproject.toml yet \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 0f28f9a..1070144 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,11 +17,11 @@ black = "^21.5b2" mypy = "^0.812" [tool.black] -extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" +extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" [tool.mypy] ignore_missing_imports = true -exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" +exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" [tool.pytest.ini_options] addopts = "--show-capture=log"