From 7eadaa0db47641783bb917781ae7738f0414c738 Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 15:39:14 -0700 Subject: [PATCH 1/8] Move analysis visitors out of parser module --- amanuensis/lexicon/gameloop.py | 43 ++++++++++++++++++++++++++--- amanuensis/parser/__init__.py | 3 --- amanuensis/parser/analyze.py | 49 ---------------------------------- 3 files changed, 40 insertions(+), 55 deletions(-) delete mode 100644 amanuensis/parser/analyze.py diff --git a/amanuensis/lexicon/gameloop.py b/amanuensis/lexicon/gameloop.py index 1ce2072..3e51150 100644 --- a/amanuensis/lexicon/gameloop.py +++ b/amanuensis/lexicon/gameloop.py @@ -9,11 +9,48 @@ from amanuensis.config import ReadOnlyOrderedDict from amanuensis.models import LexiconModel, UserModel from amanuensis.parser import ( parse_raw_markdown, - GetCitations, HtmlRenderer, titlesort, - filesafe_title, - ConstraintAnalysis) + filesafe_title) +from amanuensis.parser.core import RenderableVisitor + + +class GetCitations(RenderableVisitor): + def __init__(self): + self.citations = [] + + def ParsedArticle(self, span): + span.recurse(self) + return self.citations + + def CitationSpan(self, span): + self.citations.append(span.cite_target) + return self + + +class ConstraintAnalysis(RenderableVisitor): + def __init__(self, lexicon: LexiconModel): + self.info: List[str] = [] + self.warning: List[str] = [] + self.error: List[str] = [] + + self.word_count: int = 0 + self.citations: list = [] + self.signatures: int = 0 + + def TextSpan(self, span): + self.word_count += len(re.split(r'\s+', span.innertext.strip())) + return self + + def SignatureParagraph(self, span): + self.signatures += 1 + span.recurse(self) + return self + + def CitationSpan(self, span): + self.citations.append(span.cite_target) + span.recurse(self) + return self def get_player_characters( diff --git a/amanuensis/parser/__init__.py b/amanuensis/parser/__init__.py index 1de2c5d..5ef2072 100644 --- a/amanuensis/parser/__init__.py +++ 
b/amanuensis/parser/__init__.py @@ -2,15 +2,12 @@ Module encapsulating all markdown parsing functionality. """ -from .analyze import ConstraintAnalysis, GetCitations from .core import normalize_title from .helpers import titlesort, filesafe_title from .parsing import parse_raw_markdown from .render import PreviewHtmlRenderer, HtmlRenderer __all__ = [ - ConstraintAnalysis.__name__, - GetCitations.__name__, normalize_title.__name__, titlesort.__name__, filesafe_title.__name__, diff --git a/amanuensis/parser/analyze.py b/amanuensis/parser/analyze.py deleted file mode 100644 index bf52354..0000000 --- a/amanuensis/parser/analyze.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Internal module encapsulating visitors that compute metrics on articles -for verification against constraints. -""" - -import re -from typing import List - -from amanuensis.models import LexiconModel - -from .core import RenderableVisitor - - -class GetCitations(RenderableVisitor): - def __init__(self): - self.citations = [] - - def ParsedArticle(self, span): - span.recurse(self) - return self.citations - - def CitationSpan(self, span): - self.citations.append(span.cite_target) - return self - - -class ConstraintAnalysis(RenderableVisitor): - def __init__(self, lexicon: LexiconModel): - self.info: List[str] = [] - self.warning: List[str] = [] - self.error: List[str] = [] - - self.word_count: int = 0 - self.citations: list = [] - self.signatures: int = 0 - - def TextSpan(self, span): - self.word_count += len(re.split(r'\s+', span.innertext.strip())) - return self - - def SignatureParagraph(self, span): - self.signatures += 1 - span.recurse(self) - return self - - def CitationSpan(self, span): - self.citations.append(span.cite_target) - span.recurse(self) - return self -- 2.44.1 From ffa27be86dcb7392f055daf05d9889d492a4585d Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 15:47:54 -0700 Subject: [PATCH 2/8] Move render visitors out of parser module --- amanuensis/lexicon/gameloop.py | 48 
+++++++++++- amanuensis/lexicon/manage.py | 2 +- amanuensis/parser/__init__.py | 3 - amanuensis/parser/render.py | 104 -------------------------- amanuensis/server/session/__init__.py | 6 +- amanuensis/server/session/editor.py | 52 ++++++++++++- 6 files changed, 100 insertions(+), 115 deletions(-) delete mode 100644 amanuensis/parser/render.py diff --git a/amanuensis/lexicon/gameloop.py b/amanuensis/lexicon/gameloop.py index 3e51150..d606d57 100644 --- a/amanuensis/lexicon/gameloop.py +++ b/amanuensis/lexicon/gameloop.py @@ -9,7 +9,6 @@ from amanuensis.config import ReadOnlyOrderedDict from amanuensis.models import LexiconModel, UserModel from amanuensis.parser import ( parse_raw_markdown, - HtmlRenderer, titlesort, filesafe_title) from amanuensis.parser.core import RenderableVisitor @@ -53,6 +52,53 @@ class ConstraintAnalysis(RenderableVisitor): return self +class HtmlRenderer(RenderableVisitor): + """ + Renders an article token tree into published article HTML. + """ + def __init__(self, lexicon_name: str, written_articles: Iterable[str]): + self.lexicon_name: str = lexicon_name + self.written_articles: Iterable[str] = written_articles + + def TextSpan(self, span): + return span.innertext + + def LineBreak(self, span): + return '
' + + def ParsedArticle(self, span): + return '\n'.join(span.recurse(self)) + + def BodyParagraph(self, span): + return f'

{"".join(span.recurse(self))}

' + + def SignatureParagraph(self, span): + return ( + '

' + f'{"".join(span.recurse(self))}' + '

' + ) + + def BoldSpan(self, span): + return f'{"".join(span.recurse(self))}' + + def ItalicSpan(self, span): + return f'{"".join(span.recurse(self))}' + + def CitationSpan(self, span): + if span.cite_target in self.written_articles: + link_class = '' + else: + link_class = ' class="phantom"' + # link = url_for( + # 'lexicon.article', + # name=self.lexicon_name, + # title=filesafe_title(span.cite_target)) + link = (f'/lexicon/{self.lexicon_name}' + + f'/article/{filesafe_title(span.cite_target)}') + return f'{"".join(span.recurse(self))}' + + def get_player_characters( lexicon: LexiconModel, uid: str) -> Iterable[ReadOnlyOrderedDict]: diff --git a/amanuensis/lexicon/manage.py b/amanuensis/lexicon/manage.py index bdfbeb0..eb7844b 100644 --- a/amanuensis/lexicon/manage.py +++ b/amanuensis/lexicon/manage.py @@ -13,7 +13,7 @@ # from amanuensis.config.loader import AttrOrderedDict # from amanuensis.errors import ArgumentError # from amanuensis.lexicon import LexiconModel -# from amanuensis.parser import parse_raw_markdown, GetCitations, HtmlRenderer, filesafe_title, titlesort +# from amanuensis.parser import parse_raw_markdown, filesafe_title, titlesort # from amanuensis.resources import get_stream diff --git a/amanuensis/parser/__init__.py b/amanuensis/parser/__init__.py index 5ef2072..aff1bd4 100644 --- a/amanuensis/parser/__init__.py +++ b/amanuensis/parser/__init__.py @@ -5,13 +5,10 @@ Module encapsulating all markdown parsing functionality. 
from .core import normalize_title from .helpers import titlesort, filesafe_title from .parsing import parse_raw_markdown -from .render import PreviewHtmlRenderer, HtmlRenderer __all__ = [ normalize_title.__name__, titlesort.__name__, filesafe_title.__name__, parse_raw_markdown.__name__, - PreviewHtmlRenderer.__name__, - HtmlRenderer.__name__, ] diff --git a/amanuensis/parser/render.py b/amanuensis/parser/render.py deleted file mode 100644 index 9313c07..0000000 --- a/amanuensis/parser/render.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -Internal module encapsulating visitors that render articles into -readable formats. -""" - -from typing import Iterable - -from .core import RenderableVisitor -from .helpers import filesafe_title - - -class HtmlRenderer(RenderableVisitor): - """ - Renders an article token tree into published article HTML. - """ - def __init__(self, lexicon_name: str, written_articles: Iterable[str]): - self.lexicon_name: str = lexicon_name - self.written_articles: Iterable[str] = written_articles - - def TextSpan(self, span): - return span.innertext - - def LineBreak(self, span): - return '
' - - def ParsedArticle(self, span): - return '\n'.join(span.recurse(self)) - - def BodyParagraph(self, span): - return f'

{"".join(span.recurse(self))}

' - - def SignatureParagraph(self, span): - return ( - '

' - f'{"".join(span.recurse(self))}' - '

' - ) - - def BoldSpan(self, span): - return f'{"".join(span.recurse(self))}' - - def ItalicSpan(self, span): - return f'{"".join(span.recurse(self))}' - - def CitationSpan(self, span): - if span.cite_target in self.written_articles: - link_class = '' - else: - link_class = ' class="phantom"' - # link = url_for( - # 'lexicon.article', - # name=self.lexicon_name, - # title=filesafe_title(span.cite_target)) - link = (f'/lexicon/{self.lexicon_name}' - + f'/article/{filesafe_title(span.cite_target)}') - return f'{"".join(span.recurse(self))}' - - -class PreviewHtmlRenderer(RenderableVisitor): - def __init__(self, lexicon): - with lexicon.ctx.read('info') as info: - self.article_map = { - title: article.character - for title, article in info.items() - } - self.citations = [] - self.contents = "" - - def TextSpan(self, span): - return span.innertext - - def LineBreak(self, span): - return '
' - - def ParsedArticle(self, span): - self.contents = '\n'.join(span.recurse(self)) - return self - - def BodyParagraph(self, span): - return f'

{"".join(span.recurse(self))}

' - - def SignatureParagraph(self, span): - return ( - '

' - f'{"".join(span.recurse(self))}' - '

' - ) - - def BoldSpan(self, span): - return f'{"".join(span.recurse(self))}' - - def ItalicSpan(self, span): - return f'{"".join(span.recurse(self))}' - - def CitationSpan(self, span): - if span.cite_target in self.article_map: - if self.article_map.get(span.cite_target): - link_class = '[extant]' - else: - link_class = '[phantom]' - else: - link_class = '[new]' - self.citations.append(f'{span.cite_target} {link_class}') - return f'{"".join(span.recurse(self))}[{len(self.citations)}]' diff --git a/amanuensis/server/session/__init__.py b/amanuensis/server/session/__init__.py index 4c27787..743754d 100644 --- a/amanuensis/server/session/__init__.py +++ b/amanuensis/server/session/__init__.py @@ -15,9 +15,7 @@ from amanuensis.lexicon import ( create_character_in_lexicon, get_draft) from amanuensis.models import LexiconModel -from amanuensis.parser import ( - parse_raw_markdown, - PreviewHtmlRenderer) +from amanuensis.parser import parse_raw_markdown from amanuensis.server.helpers import ( lexicon_param, player_required, @@ -29,7 +27,7 @@ from .forms import ( LexiconPublishTurnForm, LexiconConfigForm) -from .editor import load_editor, new_draft, update_draft +from .editor import load_editor, new_draft, update_draft, PreviewHtmlRenderer bp_session = Blueprint('session', __name__, diff --git a/amanuensis/server/session/editor.py b/amanuensis/server/session/editor.py index 8492966..79a3cb3 100644 --- a/amanuensis/server/session/editor.py +++ b/amanuensis/server/session/editor.py @@ -17,8 +17,56 @@ from amanuensis.lexicon import ( from amanuensis.models import LexiconModel from amanuensis.parser import ( normalize_title, - parse_raw_markdown, - PreviewHtmlRenderer) + parse_raw_markdown) +from amanuensis.parser.core import RenderableVisitor + + +class PreviewHtmlRenderer(RenderableVisitor): + def __init__(self, lexicon): + with lexicon.ctx.read('info') as info: + self.article_map = { + title: article.character + for title, article in info.items() + } + self.citations = [] 
+ self.contents = "" + + def TextSpan(self, span): + return span.innertext + + def LineBreak(self, span): + return '
' + + def ParsedArticle(self, span): + self.contents = '\n'.join(span.recurse(self)) + return self + + def BodyParagraph(self, span): + return f'

{"".join(span.recurse(self))}

' + + def SignatureParagraph(self, span): + return ( + '

' + f'{"".join(span.recurse(self))}' + '

' + ) + + def BoldSpan(self, span): + return f'{"".join(span.recurse(self))}' + + def ItalicSpan(self, span): + return f'{"".join(span.recurse(self))}' + + def CitationSpan(self, span): + if span.cite_target in self.article_map: + if self.article_map.get(span.cite_target): + link_class = '[extant]' + else: + link_class = '[phantom]' + else: + link_class = '[new]' + self.citations.append(f'{span.cite_target} {link_class}') + return f'{"".join(span.recurse(self))}[{len(self.citations)}]' def load_editor(lexicon: LexiconModel, aid: str): -- 2.44.1 From 1c55d866a8054cc5627a342f453046a1fd7f007e Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 15:57:48 -0700 Subject: [PATCH 3/8] Reorganize parser and style pass --- amanuensis/parser/__init__.py | 13 +- amanuensis/parser/core.py | 167 ++++++++++----------- amanuensis/parser/helpers.py | 65 ++++++--- amanuensis/parser/parsing.py | 263 ++++++++++++++++++---------------- mypy.ini | 2 +- pyproject.toml | 4 +- 6 files changed, 280 insertions(+), 234 deletions(-) diff --git a/amanuensis/parser/__init__.py b/amanuensis/parser/__init__.py index aff1bd4..7aa5bd7 100644 --- a/amanuensis/parser/__init__.py +++ b/amanuensis/parser/__init__.py @@ -2,13 +2,14 @@ Module encapsulating all markdown parsing functionality. 
""" -from .core import normalize_title -from .helpers import titlesort, filesafe_title +from .core import RenderableVisitor +from .helpers import normalize_title, filesafe_title, titlesort from .parsing import parse_raw_markdown __all__ = [ - normalize_title.__name__, - titlesort.__name__, - filesafe_title.__name__, - parse_raw_markdown.__name__, + "RenderableVisitor", + "normalize_title", + "filesafe_title", + "titlesort", + "parse_raw_markdown", ] diff --git a/amanuensis/parser/core.py b/amanuensis/parser/core.py index 76f15de..d50049a 100644 --- a/amanuensis/parser/core.py +++ b/amanuensis/parser/core.py @@ -5,131 +5,134 @@ which can be operated on by a visitor defining functions that hook off of the different token types. """ -import re from typing import Callable, Any, Sequence -RenderHook = Callable[['Renderable'], Any] -Spans = Sequence['Renderable'] +from .helpers import normalize_title -def normalize_title(title: str) -> str: - """ - Normalizes strings as titles: - - Strips leading and trailing whitespace - - Merges internal whitespace into a single space - - Capitalizes the first word - """ - cleaned = re.sub(r'\s+', " ", title.strip()) - return cleaned[:1].capitalize() + cleaned[1:] +RenderHook = Callable[["Renderable"], Any] +Spans = Sequence["Renderable"] -class Renderable(): - """ - Base class for parsed markdown. Provides the `render()` method for - visiting the token tree. - """ - def render(self: 'Renderable', renderer: 'RenderableVisitor'): - """ - Execute the apppropriate visitor method on this Renderable. - """ - hook: RenderHook = getattr(renderer, type(self).__name__, None) - if hook: - return hook(self) - return None +class Renderable: + """ + Base class for parsed markdown. Provides the `render()` method for + visiting the token tree. + """ + + def render(self: "Renderable", renderer: "RenderableVisitor"): + """ + Execute the apppropriate visitor method on this Renderable. 
+ Visitors implement hooks by declaring methods whose names are + the name of a Renderable class. + """ + hook: RenderHook = getattr(renderer, type(self).__name__, None) + if hook: + return hook(self) + return None class TextSpan(Renderable): - """An unstyled length of text.""" - def __init__(self, innertext: str): - self.innertext = innertext + """A length of text.""" - def __str__(self): - return f"[{self.innertext}]" + def __init__(self, innertext: str): + self.innertext = innertext + + def __str__(self): + return f"[{self.innertext}]" class LineBreak(Renderable): - """A line break within a paragraph.""" - def __str__(self): - return "" + """A line break within a paragraph.""" + + def __str__(self): + return "" class SpanContainer(Renderable): - """A formatting element that wraps some amount of text.""" - def __init__(self, spans: Spans): - self.spans: Spans = spans + """A formatting element that wraps some amount of text.""" - def __str__(self): - return (f'[{type(self).__name__} ' - + f'{" ".join([str(span) for span in self.spans])}]') + def __init__(self, spans: Spans): + self.spans: Spans = spans - def recurse(self, renderer: 'RenderableVisitor'): - return [child.render(renderer) for child in self.spans] + def __str__(self): + return ( + f"[{type(self).__name__} " + + f'{" ".join([str(span) for span in self.spans])}]' + ) + + def recurse(self, renderer: "RenderableVisitor"): + return [child.render(renderer) for child in self.spans] class ParsedArticle(SpanContainer): - """Token tree root node, containing some number of paragraph tokens.""" + """Token tree root node, containing some number of paragraph tokens.""" class BodyParagraph(SpanContainer): - """A normal paragraph.""" + """A normal paragraph.""" class SignatureParagraph(SpanContainer): - """A paragraph preceded by a signature mark.""" + """A paragraph preceded by a signature mark.""" class BoldSpan(SpanContainer): - """A span of text inside bold marks.""" + """A span of text inside bold marks.""" 
class ItalicSpan(SpanContainer): - """A span of text inside italic marks.""" + """A span of text inside italic marks.""" class CitationSpan(SpanContainer): - """A citation to another article.""" - def __init__(self, spans: Spans, cite_target: str): - super().__init__(spans) - # Normalize citation target on parse, since we don't want - # abnormal title strings lying around causing trouble. - self.cite_target: str = normalize_title(cite_target) + """A citation to another article.""" - def __str__(self): - return (f'{{{" ".join([str(span) for span in self.spans])}' - + f':{self.cite_target}}}') + def __init__(self, spans: Spans, cite_target: str): + super().__init__(spans) + # Normalize citation target on parse, since we don't want + # abnormal title strings lying around causing trouble. + self.cite_target: str = normalize_title(cite_target) + + def __str__(self) -> str: + return ( + f'{{{" ".join([str(span) for span in self.spans])}' + + f":{self.cite_target}}}" + ) -class RenderableVisitor(): - """ - Default implementation of the visitor pattern. Executes once on - each token in the tree and returns itself. - """ - def TextSpan(self, span: TextSpan): - return self +class RenderableVisitor: + """ + Default implementation of the visitor pattern. Executes once on + each token in the tree and returns itself. 
+ """ - def LineBreak(self, span: LineBreak): - return self + def TextSpan(self, span: TextSpan): + return self - def ParsedArticle(self, span: ParsedArticle): - span.recurse(self) - return self + def LineBreak(self, span: LineBreak): + return self - def BodyParagraph(self, span: BodyParagraph): - span.recurse(self) - return self + def ParsedArticle(self, span: ParsedArticle): + span.recurse(self) + return self - def SignatureParagraph(self, span: SignatureParagraph): - span.recurse(self) - return self + def BodyParagraph(self, span: BodyParagraph): + span.recurse(self) + return self - def BoldSpan(self, span: BoldSpan): - span.recurse(self) - return self + def SignatureParagraph(self, span: SignatureParagraph): + span.recurse(self) + return self - def ItalicSpan(self, span: ItalicSpan): - span.recurse(self) - return self + def BoldSpan(self, span: BoldSpan): + span.recurse(self) + return self - def CitationSpan(self, span: CitationSpan): - span.recurse(self) - return self + def ItalicSpan(self, span: ItalicSpan): + span.recurse(self) + return self + + def CitationSpan(self, span: CitationSpan): + span.recurse(self) + return self diff --git a/amanuensis/parser/helpers.py b/amanuensis/parser/helpers.py index e6eabfb..7f2123d 100644 --- a/amanuensis/parser/helpers.py +++ b/amanuensis/parser/helpers.py @@ -1,28 +1,53 @@ +""" +Helper functions for manipulating titles during parsing +""" + import re import urllib.parse +def normalize_title(title: str) -> str: + """ + Normalizes strings as titles: + - Strips leading and trailing whitespace + - Merges internal whitespace into a single space + - Capitalizes the first word + """ + cleaned = re.sub(r"\s+", " ", title.strip()) + return cleaned[:1].capitalize() + cleaned[1:] + + def titlesort(title: str) -> str: - """ - Strips articles off of titles for alphabetical sorting purposes - """ - lower = title.lower() - if lower.startswith("the "): - return lower[4:] - if lower.startswith("an "): - return lower[3:] - if 
lower.startswith("a "): - return lower[2:] - return lower + """ + Strips articles off of titles for alphabetical sorting purposes + """ + lower = title.lower() + if lower.startswith("the "): + return lower[4:] + if lower.startswith("an "): + return lower[3:] + if lower.startswith("a "): + return lower[2:] + return lower def filesafe_title(title: str) -> str: - """ - Makes an article title filename-safe. - """ - s = re.sub(r"\s+", '_', title) # Replace whitespace with _ - s = re.sub(r"~", '-', s) # parse.quote doesn't catch ~ - s = urllib.parse.quote(s) # Encode all other characters - s = re.sub(r"%", "", s) # Strip encoding %s - s = s[:64] # Limit to 64 characters - return s + """ + Makes an article title filename-safe. + """ + # Replace whitespace with _ + s = re.sub(r"\s+", "_", title) + + # parse.quote doesn't catch ~ + s = re.sub(r"~", "-", s) + + # Encode all other characters + s = urllib.parse.quote(s) + + # Strip encoding %s + s = re.sub(r"%", "", s) + + # Limit to 64 characters + s = s[:64] + + return s diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py index 4e0695f..c6bb50b 100644 --- a/amanuensis/parser/parsing.py +++ b/amanuensis/parser/parsing.py @@ -7,150 +7,167 @@ import re from typing import Sequence from .core import ( - TextSpan, - LineBreak, - ParsedArticle, - BodyParagraph, - SignatureParagraph, - BoldSpan, - ItalicSpan, - CitationSpan, - Renderable, - SpanContainer + TextSpan, + LineBreak, + ParsedArticle, + BodyParagraph, + SignatureParagraph, + BoldSpan, + ItalicSpan, + CitationSpan, + Renderable, + SpanContainer, ) Spans = Sequence[Renderable] def parse_raw_markdown(text: str) -> ParsedArticle: - """ - Parses a body of Lexipython markdown into a Renderable tree. 
- """ - # Parse each paragraph individually, as no formatting applies - # across paragraphs - paragraphs = re.split(r'\n\n+', text) - parse_results = list(map(parse_paragraph, paragraphs)) - return ParsedArticle(parse_results) + """ + Parses a body of Lexipython markdown into a Renderable tree. + """ + # Parse each paragraph individually, as no formatting applies + # across paragraphs + paragraphs = re.split(r"\n\n+", text) + parse_results = list(map(parse_paragraph, paragraphs)) + return ParsedArticle(parse_results) def parse_paragraph(text: str) -> SpanContainer: - # Parse the paragraph as a span of text - text = text.strip() - if text and text[0] == '~': - return SignatureParagraph(parse_paired_formatting(text[1:])) - else: - return BodyParagraph(parse_paired_formatting(text)) + # Parse the paragraph as a span of text + text = text.strip() + if text and text[0] == "~": + return SignatureParagraph(parse_paired_formatting(text[1:])) + else: + return BodyParagraph(parse_paired_formatting(text)) def parse_paired_formatting( - text: str, - cite: bool = True, - bold: bool = True, - italic: bool = True) -> Spans: - # Find positions of any paired formatting - first_cite = find_pair(text, "[[", "]]", cite) - first_bold = find_pair(text, "**", "**", bold) - first_italic = find_pair(text, "//", "//", italic) - # Load the possible parse handlers into the map - handlers = {} - handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) - handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) - handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) - # If nothing was found, move on to the next parsing step - handlers[-1] = lambda: parse_breaks(text) - # Choose a handler based on the earliest found result - finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] - first = min(finds) if finds else -1 - return handlers[first]() + text: str, + cite: bool = True, + bold: bool = True, + italic: bool = True, +) -> 
Spans: + # Find positions of any paired formatting + first_cite = find_pair(text, "[[", "]]", cite) + first_bold = find_pair(text, "**", "**", bold) + first_italic = find_pair(text, "//", "//", italic) + # Load the possible parse handlers into the map + handlers = {} + handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) + handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) + handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) + # If nothing was found, move on to the next parsing step + handlers[-1] = lambda: parse_breaks(text) + # Choose a handler based on the earliest found result + finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] + first = min(finds) if finds else -1 + return handlers[first]() def find_pair( - text: str, - open_tag: str, - close_tag: str, - valid: bool) -> int: - # If skipping, return -1 - if not valid: - return -1 - # If the open tag wasn't found, return -1 - first = text.find(open_tag) - if first < 0: - return -1 - # If the close tag wasn't found after the open tag, return -1 - second = text.find(close_tag, first + len(open_tag)) - if second < 0: - return -1 - # Otherwise, the pair exists - return first + text: str, + open_tag: str, + close_tag: str, + valid: bool, +) -> int: + # If skipping, return -1 + if not valid: + return -1 + # If the open tag wasn't found, return -1 + first = text.find(open_tag) + if first < 0: + return -1 + # If the close tag wasn't found after the open tag, return -1 + second = text.find(close_tag, first + len(open_tag)) + if second < 0: + return -1 + # Otherwise, the pair exists + return first -def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans: - cite_open = text.find("[[") - if cite_open > -1: - cite_close = text.find("]]", cite_open + 2) - # Since we searched for pairs from the beginning, there should be no - # undetected pair formatting before this one, so move to the next - # level of parsing - 
spans_before = parse_breaks(text[:cite_open]) - # Continue parsing pair formatting after this one closes with all - # three as valid choices - spans_after = parse_paired_formatting(text[cite_close + 2:]) - # Parse inner text and skip parsing for this format pair - text_inner = text[cite_open + 2:cite_close] - # For citations specifically, we may need to split off a citation - # target from the alias text - inner_split = text_inner.split("|", 1) - text_inner_actual, cite_target = inner_split[0], inner_split[-1] - spans_inner = parse_paired_formatting(text_inner_actual, - cite=False, bold=bold, italic=italic) - citation = CitationSpan(spans_inner, cite_target) - return [*spans_before, citation, *spans_after] - # Should never happen - return parse_breaks(text) +def parse_citation( + text: str, + bold: bool = True, + italic: bool = True, +) -> Spans: + cite_open = text.find("[[") + if cite_open > -1: + cite_close = text.find("]]", cite_open + 2) + # Since we searched for pairs from the beginning, there should be no + # undetected pair formatting before this one, so move to the next + # level of parsing + spans_before = parse_breaks(text[:cite_open]) + # Continue parsing pair formatting after this one closes with all + # three as valid choices + spans_after = parse_paired_formatting(text[cite_close + 2 :]) + # Parse inner text and skip parsing for this format pair + text_inner = text[cite_open + 2 : cite_close] + # For citations specifically, we may need to split off a citation + # target from the alias text + inner_split = text_inner.split("|", 1) + text_inner_actual, cite_target = inner_split[0], inner_split[-1] + spans_inner = parse_paired_formatting( + text_inner_actual, cite=False, bold=bold, italic=italic + ) + citation = CitationSpan(spans_inner, cite_target) + return [*spans_before, citation, *spans_after] + # Should never happen + return parse_breaks(text) -def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans: - bold_open = text.find("**") 
- if bold_open > -1: - bold_close = text.find("**", bold_open + 2) - # Should be no formatting behind us - spans_before = parse_breaks(text[:bold_open]) - # Freely parse formatting after us - spans_after = parse_paired_formatting(text[bold_close + 2:]) - # Parse inner text minus bold parsing - text_inner = text[bold_open + 2:bold_close] - spans_inner = parse_paired_formatting(text_inner, - cite=cite, bold=False, italic=italic) - bold = BoldSpan(spans_inner) - return [*spans_before, bold, *spans_after] - # Should never happen - return parse_italic(text) +def parse_bold( + text: str, + cite: bool = True, + italic: bool = True, +) -> Spans: + bold_open = text.find("**") + if bold_open > -1: + bold_close = text.find("**", bold_open + 2) + # Should be no formatting behind us + spans_before = parse_breaks(text[:bold_open]) + # Freely parse formatting after us + spans_after = parse_paired_formatting(text[bold_close + 2 :]) + # Parse inner text minus bold parsing + text_inner = text[bold_open + 2 : bold_close] + spans_inner = parse_paired_formatting( + text_inner, cite=cite, bold=False, italic=italic + ) + bold = BoldSpan(spans_inner) + return [*spans_before, bold, *spans_after] + # Should never happen + return parse_italic(text) -def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans: - italic_open = text.find("//") - if italic_open > -1: - italic_close = text.find("//", italic_open + 2) - # Should be no formatting behind us - spans_before = parse_breaks(text[:italic_open]) - # Freely parse formatting after us - spans_after = parse_paired_formatting(text[italic_close + 2:]) - # Parse inner text minus italic parsing - text_inner = text[italic_open + 2:italic_close] - spans_inner = parse_paired_formatting(text_inner, - cite=cite, bold=bold, italic=False) - italic = ItalicSpan(spans_inner) - return [*spans_before, italic, *spans_after] - # Should never happen - return parse_breaks(text) +def parse_italic( + text: str, + cite: bool = True, + bold: bool = 
True, +) -> Spans: + italic_open = text.find("//") + if italic_open > -1: + italic_close = text.find("//", italic_open + 2) + # Should be no formatting behind us + spans_before = parse_breaks(text[:italic_open]) + # Freely parse formatting after us + spans_after = parse_paired_formatting(text[italic_close + 2 :]) + # Parse inner text minus italic parsing + text_inner = text[italic_open + 2 : italic_close] + spans_inner = parse_paired_formatting( + text_inner, cite=cite, bold=bold, italic=False + ) + italic = ItalicSpan(spans_inner) + return [*spans_before, italic, *spans_after] + # Should never happen + return parse_breaks(text) def parse_breaks(text: str) -> Spans: - if not text: - return [] - splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) - spans: Spans = [ - splits[i // 2] if i % 2 == 0 else LineBreak() - for i in range(0, 2 * len(splits) - 1) - ] - return spans + if not text: + return [] + splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) + spans: Spans = [ + splits[i // 2] if i % 2 == 0 else LineBreak() + for i in range(0, 2 * len(splits) - 1) + ] + return spans diff --git a/mypy.ini b/mypy.ini index 0d8ecb7..febf6cd 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,4 +1,4 @@ [mypy] ignore_missing_imports = true -exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" +exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" ; mypy stable doesn't support pyproject.toml yet \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 0f28f9a..1070144 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,11 +17,11 @@ black = "^21.5b2" mypy = "^0.812" [tool.black] -extend-exclude = 
"^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" +extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" [tool.mypy] ignore_missing_imports = true -exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" +exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" [tool.pytest.ini_options] addopts = "--show-capture=log" -- 2.44.1 From 7a847e96d3403b91c67abd4d02bd893c008f7d52 Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 18:20:23 -0700 Subject: [PATCH 4/8] Add unit tests for line breaks and simple pairs --- amanuensis/parser/core.py | 16 ++-- amanuensis/parser/parsing.py | 76 +++++++++------ tests/test_parser.py | 176 +++++++++++++++++++++++++++++++++++ 3 files changed, 233 insertions(+), 35 deletions(-) create mode 100644 tests/test_parser.py diff --git a/amanuensis/parser/core.py b/amanuensis/parser/core.py index d50049a..cd1b6a1 100644 --- a/amanuensis/parser/core.py +++ b/amanuensis/parser/core.py @@ -38,14 +38,14 @@ class TextSpan(Renderable): def __init__(self, innertext: str): self.innertext = innertext - def __str__(self): - return f"[{self.innertext}]" + def __repr__(self): + return f"<{self.innertext}>" class LineBreak(Renderable): """A line break within a paragraph.""" - def __str__(self): + def __repr__(self): return "" @@ -55,10 +55,10 @@ class SpanContainer(Renderable): def __init__(self, 
spans: Spans): self.spans: Spans = spans - def __str__(self): + def __repr__(self): return ( - f"[{type(self).__name__} " - + f'{" ".join([str(span) for span in self.spans])}]' + f"<{type(self).__name__} " + + f'{" ".join([repr(span) for span in self.spans])}>' ) def recurse(self, renderer: "RenderableVisitor"): @@ -94,9 +94,9 @@ class CitationSpan(SpanContainer): # abnormal title strings lying around causing trouble. self.cite_target: str = normalize_title(cite_target) - def __str__(self) -> str: + def __repr__(self) -> str: return ( - f'{{{" ".join([str(span) for span in self.spans])}' + f'{{{" ".join([repr(span) for span in self.spans])}' + f":{self.cite_target}}}" ) diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py index c6bb50b..a16afae 100644 --- a/amanuensis/parser/parsing.py +++ b/amanuensis/parser/parsing.py @@ -34,6 +34,9 @@ def parse_raw_markdown(text: str) -> ParsedArticle: def parse_paragraph(text: str) -> SpanContainer: + """ + Parses a block of text into a paragraph object. + """ # Parse the paragraph as a span of text text = text.strip() if text and text[0] == "~": @@ -44,19 +47,28 @@ def parse_paragraph(text: str) -> SpanContainer: def parse_paired_formatting( text: str, - cite: bool = True, - bold: bool = True, - italic: bool = True, + can_cite: bool = True, + can_bold: bool = True, + can_italic: bool = True, ) -> Spans: + """ + Parses citations, bolds, and italics, which can be nested inside each other. 
+ """ # Find positions of any paired formatting - first_cite = find_pair(text, "[[", "]]", cite) - first_bold = find_pair(text, "**", "**", bold) - first_italic = find_pair(text, "//", "//", italic) + first_cite = find_pair(text, "[[", "]]") if can_cite else -1 + first_bold = find_pair(text, "**", "**") if can_bold else -1 + first_italic = find_pair(text, "//", "//") if can_italic else -1 # Load the possible parse handlers into the map handlers = {} - handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) - handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) - handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) + handlers[first_cite] = lambda: parse_citation( + text, can_bold=can_bold, can_italic=can_italic + ) + handlers[first_bold] = lambda: parse_bold( + text, can_cite=can_cite, can_italic=can_italic + ) + handlers[first_italic] = lambda: parse_italic( + text, can_cite=can_cite, can_bold=can_bold + ) # If nothing was found, move on to the next parsing step handlers[-1] = lambda: parse_breaks(text) # Choose a handler based on the earliest found result @@ -65,15 +77,10 @@ def parse_paired_formatting( return handlers[first]() -def find_pair( - text: str, - open_tag: str, - close_tag: str, - valid: bool, -) -> int: - # If skipping, return -1 - if not valid: - return -1 +def find_pair(text: str, open_tag: str, close_tag: str) -> int: + """ + Finds the beginning of a pair of formatting marks. + """ # If the open tag wasn't found, return -1 first = text.find(open_tag) if first < 0: @@ -88,9 +95,12 @@ def find_pair( def parse_citation( text: str, - bold: bool = True, - italic: bool = True, + can_bold: bool = True, + can_italic: bool = True, ) -> Spans: + """ + Parses text into a citation span. 
+ """ cite_open = text.find("[[") if cite_open > -1: cite_close = text.find("]]", cite_open + 2) @@ -108,7 +118,7 @@ def parse_citation( inner_split = text_inner.split("|", 1) text_inner_actual, cite_target = inner_split[0], inner_split[-1] spans_inner = parse_paired_formatting( - text_inner_actual, cite=False, bold=bold, italic=italic + text_inner_actual, can_cite=False, can_bold=can_bold, can_italic=can_italic ) citation = CitationSpan(spans_inner, cite_target) return [*spans_before, citation, *spans_after] @@ -118,9 +128,12 @@ def parse_citation( def parse_bold( text: str, - cite: bool = True, - italic: bool = True, + can_cite: bool = True, + can_italic: bool = True, ) -> Spans: + """ + Parses text into a bold span. + """ bold_open = text.find("**") if bold_open > -1: bold_close = text.find("**", bold_open + 2) @@ -131,7 +144,7 @@ def parse_bold( # Parse inner text minus bold parsing text_inner = text[bold_open + 2 : bold_close] spans_inner = parse_paired_formatting( - text_inner, cite=cite, bold=False, italic=italic + text_inner, can_cite=can_cite, can_bold=False, can_italic=can_italic ) bold = BoldSpan(spans_inner) return [*spans_before, bold, *spans_after] @@ -141,9 +154,12 @@ def parse_bold( def parse_italic( text: str, - cite: bool = True, - bold: bool = True, + can_cite: bool = True, + can_bold: bool = True, ) -> Spans: + """ + Parses text into an italic span. + """ italic_open = text.find("//") if italic_open > -1: italic_close = text.find("//", italic_open + 2) @@ -154,7 +170,7 @@ def parse_italic( # Parse inner text minus italic parsing text_inner = text[italic_open + 2 : italic_close] spans_inner = parse_paired_formatting( - text_inner, cite=cite, bold=bold, italic=False + text_inner, can_cite=can_cite, can_bold=can_bold, can_italic=False ) italic = ItalicSpan(spans_inner) return [*spans_before, italic, *spans_after] @@ -163,9 +179,15 @@ def parse_italic( def parse_breaks(text: str) -> Spans: + """ + Parses intra-paragraph line breaks. 
+ """ + # Parse empty text into nothing if not text: return [] + # Split on the line break mark appearing at the end of the line splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) + # Put a LineBreak between each TextSpan spans: Spans = [ splits[i // 2] if i % 2 == 0 else LineBreak() for i in range(0, 2 * len(splits) - 1) diff --git a/tests/test_parser.py b/tests/test_parser.py new file mode 100644 index 0000000..5a27765 --- /dev/null +++ b/tests/test_parser.py @@ -0,0 +1,176 @@ +from typing import Sequence + +from amanuensis.parser.core import ( + TextSpan, + LineBreak, + ParsedArticle, + BodyParagraph, + SignatureParagraph, + BoldSpan, + ItalicSpan, + CitationSpan, + Renderable, + SpanContainer, + RenderableVisitor, + Spans, +) +from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort +from amanuensis.parser.parsing import ( + parse_breaks, + parse_paired_formatting, + parse_paragraph, + parse_raw_markdown, +) + + +def assert_types(spans: Spans, types: Sequence, loc=None): + """ + Asserts that a span list has the types specified. + Each element in `types` should be either a span type or a list. The first + element of the list is the container type and the remaining elements are the + content types. 
+ """ + assert len(spans) == len( + types + ), f"Unexpected type sequence length at loc {loc if loc else 'root'}" + i = -1 + for span, span_type in zip(spans, types): + i += 1 + i_loc = f"{loc}.{i}" if loc else f"{i}" + if isinstance(span_type, list): + assert isinstance( + span, SpanContainer + ), f"Expected a span container at loc {i_loc}" + assert ( + len(span.spans) == len(span_type) - 1 + ), f"Unexpected container size at loc {i_loc}" + assert isinstance( + span, span_type[0] + ), f"Unexpected container type at loc {i_loc}" + assert_types(span.spans, span_type[1:], loc=i_loc) + else: + assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}" + assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}" + + +def assert_text(spans: Spans, texts: Sequence, loc=None): + """ + Asserts that a span list has the inner text structure specified. + Each element in `texts` should be either a string or a list of the same. + """ + assert len(spans) == len( + texts + ), f"Unexpected text sequence length at loc {loc if loc else 'root'}" + i = -1 + for span, text in zip(spans, texts): + i += 1 + i_loc = f"{loc}.{i}" if loc else f"{i}" + if isinstance(text, str): + assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}" + assert span.innertext == text, f"Unexpected text at loc {i_loc}" + elif isinstance(text, list): + assert isinstance( + span, SpanContainer + ), f"Expected a span container at loc {i_loc}" + assert_text(span.spans, text, loc=i_loc) + else: + assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}" + + +def test_parse_breaks(): + """Test parsing for intra-paragraph line break""" + text: str + spans: Spans + + # Only having a line break does nothing + text = "One\nTwo" + spans: Spans = parse_breaks(text) + assert_types(spans, [TextSpan]) + assert_text(spans, [text]) + + # Having the mark causes the text to be split across it + text = r"One\\" + "\nTwo" + spans: Spans = parse_breaks(text) +
assert_types(spans, [TextSpan, LineBreak, TextSpan]) + assert_text(spans, ["One", None, "Two"]) + + # Multiple lines can be broken + text = r"One\\" + "\n" + r"Two\\" + "\nThree" + spans: Spans = parse_breaks(text) + assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan]) + assert_text(spans, ["One", None, "Two", None, "Three"]) + + # The mark must be at the end of the line + text = r"One\\ " + "\nTwo" + spans: Spans = parse_breaks(text) + assert_types(spans, (TextSpan,)) + assert_text(spans, [text]) + + +def test_simple_single_parse_pairs(): + """Test parsing for bold and italic marks""" + text: str + spans: Spans + + # Empty pair marks should parse + text = "****" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan]]) + + text = "////" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan]]) + + # Pair marks with text inside should parse + text = "**hello**" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan]]) + assert_text(spans, [["hello"]]) + + text = "//hello//" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, TextSpan]]) + assert_text(spans, [["hello"]]) + + # Text outside of pair marks should parse on the same level + text = "**hello** world" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan], TextSpan]) + assert_text(spans, [["hello"], " world"]) + + text = "//hello// world" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, TextSpan], TextSpan]) + assert_text(spans, [["hello"], " world"]) + + # Text before, between, and after pair marks should parse + text = "In the **beginning** was //the// Word" + spans = parse_paired_formatting(text) + assert_types( + spans, + [TextSpan, [BoldSpan, TextSpan], TextSpan, [ItalicSpan, TextSpan], TextSpan], + ) + assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"]) + + +def test_simple_nested_parse_pairs(): + """Test parsing 
for nesting bold and italic""" + text: str + spans: Spans + + # Simple nested test cases + text = "**//hello//**" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, [ItalicSpan, TextSpan]]]) + assert_text(spans, [[["hello"]]]) + + text = "//**world**//" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, [BoldSpan, TextSpan]]]) + assert_text(spans, [[["world"]]]) + + # Overlap should only parse the first + text = "**Hello//world**//" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan], TextSpan]) + assert_text(spans, [["Hello//world"], "//"]) -- 2.44.1 From 6f380bd49565907536088be1963b5a60bee9194b Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 20:14:32 -0700 Subject: [PATCH 5/8] Fix parsing pair marks with line breaks inside --- amanuensis/parser/parsing.py | 69 ++++++++++++++++++++++-------------- tests/test_parser.py | 26 ++++++++++++++ 2 files changed, 69 insertions(+), 26 deletions(-) diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py index a16afae..e2d7b1c 100644 --- a/amanuensis/parser/parsing.py +++ b/amanuensis/parser/parsing.py @@ -47,32 +47,40 @@ def parse_paragraph(text: str) -> SpanContainer: def parse_paired_formatting( text: str, - can_cite: bool = True, - can_bold: bool = True, - can_italic: bool = True, + in_cite: bool = False, + in_bold: bool = False, + in_italic: bool = False, ) -> Spans: """ Parses citations, bolds, and italics, which can be nested inside each other. + A single type cannot nest inside itself, which is controlled by setting the + flag parameters to True.
""" # Find positions of any paired formatting - first_cite = find_pair(text, "[[", "]]") if can_cite else -1 - first_bold = find_pair(text, "**", "**") if can_bold else -1 - first_italic = find_pair(text, "//", "//") if can_italic else -1 - # Load the possible parse handlers into the map + next_cite = find_pair(text, "[[", "]]") if not in_cite else -1 + next_bold = find_pair(text, "**", "**") if not in_bold else -1 + next_italic = find_pair(text, "//", "//") if not in_italic else -1 + # Create a map from a formatting mark's distance to its parse handler handlers = {} - handlers[first_cite] = lambda: parse_citation( - text, can_bold=can_bold, can_italic=can_italic + handlers[next_cite] = lambda: parse_citation( + text, in_bold=in_bold, in_italic=in_italic ) - handlers[first_bold] = lambda: parse_bold( - text, can_cite=can_cite, can_italic=can_italic + handlers[next_bold] = lambda: parse_bold( + text, in_cite=in_cite, in_italic=in_italic ) - handlers[first_italic] = lambda: parse_italic( - text, can_cite=can_cite, can_bold=can_bold + handlers[next_italic] = lambda: parse_italic( + text, in_cite=in_cite, in_bold=in_bold ) - # If nothing was found, move on to the next parsing step - handlers[-1] = lambda: parse_breaks(text) - # Choose a handler based on the earliest found result - finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] + # Map the next parsing step at -1. If we're currently inside a formatting + # mark pair, skip parsing line breaks, which are not allowed inside paired + # marks. + if in_cite or in_bold or in_italic: + handlers[-1] = lambda: parse_text(text) + else: + handlers[-1] = lambda: parse_breaks(text) + # Choose the handler for the earliest found pair, or the default handler + # at -1 if nothing was found. 
+ finds = [i for i in (next_cite, next_bold, next_italic) if i > -1] first = min(finds) if finds else -1 return handlers[first]() @@ -95,8 +103,8 @@ def find_pair(text: str, open_tag: str, close_tag: str) -> int: def parse_citation( text: str, - can_bold: bool = True, - can_italic: bool = True, + in_bold: bool = False, + in_italic: bool = False, ) -> Spans: """ Parses text into a citation span. @@ -118,7 +126,7 @@ def parse_citation( inner_split = text_inner.split("|", 1) text_inner_actual, cite_target = inner_split[0], inner_split[-1] spans_inner = parse_paired_formatting( - text_inner_actual, can_cite=False, can_bold=can_bold, can_italic=can_italic + text_inner_actual, in_cite=True, in_bold=in_bold, in_italic=in_italic ) citation = CitationSpan(spans_inner, cite_target) return [*spans_before, citation, *spans_after] @@ -128,8 +136,8 @@ def parse_citation( def parse_bold( text: str, - can_cite: bool = True, - can_italic: bool = True, + in_cite: bool = False, + in_italic: bool = False, ) -> Spans: """ Parses text into a bold span. @@ -144,7 +152,7 @@ def parse_bold( # Parse inner text minus bold parsing text_inner = text[bold_open + 2 : bold_close] spans_inner = parse_paired_formatting( - text_inner, can_cite=can_cite, can_bold=False, can_italic=can_italic + text_inner, in_cite=in_cite, in_bold=True, in_italic=in_italic ) bold = BoldSpan(spans_inner) return [*spans_before, bold, *spans_after] @@ -154,8 +162,8 @@ def parse_bold( def parse_italic( text: str, - can_cite: bool = True, - can_bold: bool = True, + in_cite: bool = False, + in_bold: bool = False, ) -> Spans: """ Parses text into an italic span. 
@@ -170,7 +178,7 @@ def parse_italic( # Parse inner text minus italic parsing text_inner = text[italic_open + 2 : italic_close] spans_inner = parse_paired_formatting( - text_inner, can_cite=can_cite, can_bold=can_bold, can_italic=False + text_inner, in_cite=in_cite, in_bold=in_bold, in_italic=True ) italic = ItalicSpan(spans_inner) return [*spans_before, italic, *spans_after] @@ -193,3 +201,12 @@ def parse_breaks(text: str) -> Spans: for i in range(0, 2 * len(splits) - 1) ] return spans + + +def parse_text(text: str) -> Spans: + """ + Parses text with no remaining parseable marks. + """ + if not text: + return [] + return [TextSpan(text)] diff --git a/tests/test_parser.py b/tests/test_parser.py index 5a27765..ec9ade8 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -153,6 +153,32 @@ def test_simple_single_parse_pairs(): assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"]) +def test_simple_parse_pairs_with_break(): + """Test pair marks with breaks""" + text: str + spans: Spans + + text = r"**glory\\" + "\nhammer**" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan]]) + assert_text(spans, [["glory\\\\\nhammer"]]) + + text = r"//glory\\" + "\nhammer//" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, TextSpan]]) + assert_text(spans, [["glory\\\\\nhammer"]]) + + text = r"**glory\\" + "\n**hammer**" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan], TextSpan]) + assert_text(spans, [["glory\\\\\n"], "hammer**"]) + + text = r"//glory\\" + "\n//hammer//" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, TextSpan], TextSpan]) + assert_text(spans, [["glory\\\\\n"], "hammer//"]) + + def test_simple_nested_parse_pairs(): """Test parsing for nesting bold and italic""" text: str -- 2.44.1 From 2c294f7f1284b48e0cb2969b9f72e0bf9d71cf7d Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 21:32:29 -0700 Subject: 
[PATCH 6/8] Add citation parsing unit tests --- amanuensis/parser/parsing.py | 5 +- tests/test_parser.py | 123 ++++++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 5 deletions(-) diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py index e2d7b1c..3dd08c8 100644 --- a/amanuensis/parser/parsing.py +++ b/amanuensis/parser/parsing.py @@ -121,8 +121,9 @@ def parse_citation( spans_after = parse_paired_formatting(text[cite_close + 2 :]) # Parse inner text and skip parsing for this format pair text_inner = text[cite_open + 2 : cite_close] - # For citations specifically, we may need to split off a citation - # target from the alias text + # For citations specifically, try to split off a citation target. + # If there's no citation target to split, use the same text as the + # citation text and the target. inner_split = text_inner.split("|", 1) text_inner_actual, cite_target = inner_split[0], inner_split[-1] spans_inner = parse_paired_formatting( diff --git a/tests/test_parser.py b/tests/test_parser.py index ec9ade8..269c815 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -107,7 +107,7 @@ def test_parse_breaks(): assert_text(spans, [text]) -def test_simple_single_parse_pairs(): +def test_parse_pairs_single(): """Test parsing for bold and italic marks""" text: str spans: Spans @@ -153,7 +153,7 @@ def test_simple_single_parse_pairs(): assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"]) -def test_simple_parse_pairs_with_break(): +def test_parse_pairs_break(): """Test pair marks with breaks""" text: str spans: Spans @@ -179,7 +179,7 @@ def test_simple_parse_pairs_with_break(): assert_text(spans, [["glory\\\\\n"], "hammer//"]) -def test_simple_nested_parse_pairs(): +def test_parse_pairs_nested(): """Test parsing for nesting bold and italic""" text: str spans: Spans @@ -200,3 +200,120 @@ def test_simple_nested_parse_pairs(): spans = parse_paired_formatting(text) assert_types(spans, [[BoldSpan, TextSpan], 
TextSpan]) assert_text(spans, [["Hello//world"], "//"]) + + +def test_normalize_title(): + """Test the title normalization used by the citation parser""" + nt = normalize_title + assert nt("hello") == "Hello" + assert nt(" world ") == "World" + assert nt("Waiting for Godot") == "Waiting for Godot" + assert nt("lowercase letters") == "Lowercase letters" + + +def test_parse_citation_single(): + """Test parsing citations, which have internal formatting""" + text: str + spans: Spans + + # Simple test cases + text = "[[hello]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [["hello"]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "Hello" + + text = "[[hello|world]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [["hello"]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "World" + + text = "[[hello||world]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [["hello"]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "|world" + + text = "[[ hello | world ]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [[" hello "]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "World" + + text = "[[faith|hope|love]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [["faith"]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "Hope|love" + + text = "[[ [[|]] ]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan], TextSpan]) + assert_text(spans, [[" [["], " ]]"]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "" + + +def test_parse_citation_break(): + """Test citations with breaks""" + text: str + spans: 
Spans + + text = "[[hello\\\\\nworld]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [["hello\\\\\nworld"]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "Hello\\\\ world" + + text = "[[one|two\\\\\nthree]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [["one"]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "Two\\\\ three" + + +def test_parse_citation_nested(): + """Test nesting with citations""" + text: str + spans: Spans + + text = "[[**hello world**]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, [BoldSpan, TextSpan]]]) + assert_text(spans, [[["hello world"]]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "**hello world**" + + text = "[[**hello|world**]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan]]) + assert_text(spans, [["**hello"]]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "World**" + + text = "**[[hello world]]**" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, [CitationSpan, TextSpan]]]) + assert_text(spans, [[["hello world"]]]) + citation: CitationSpan = spans[0].spans[0] + assert citation.cite_target == "Hello world" + + text = "**[[hello world**]]" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan], TextSpan]) + assert_text(spans, [["[[hello world"], "]]"]) + + text = "[[**hello world]]**" + spans = parse_paired_formatting(text) + assert_types(spans, [[CitationSpan, TextSpan], TextSpan]) + assert_text(spans, [["**hello world"], "**"]) + citation: CitationSpan = spans[0] + assert citation.cite_target == "**hello world" -- 2.44.1 From fde12ac818368e195b2f1faafb9029d0964acae7 Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Fri, 11 Jun 2021 23:26:28 -0700 Subject: [PATCH 7/8] Add paragraph and 
article parser tests --- amanuensis/parser/parsing.py | 8 +--- tests/test_parser.py | 77 ++++++++++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 13 deletions(-) diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py index 3dd08c8..da800e8 100644 --- a/amanuensis/parser/parsing.py +++ b/amanuensis/parser/parsing.py @@ -65,12 +65,8 @@ def parse_paired_formatting( handlers[next_cite] = lambda: parse_citation( text, in_bold=in_bold, in_italic=in_italic ) - handlers[next_bold] = lambda: parse_bold( - text, in_cite=in_cite, in_italic=in_italic - ) - handlers[next_italic] = lambda: parse_italic( - text, in_cite=in_cite, in_bold=in_bold - ) + handlers[next_bold] = lambda: parse_bold(text, in_cite=in_cite, in_italic=in_italic) + handlers[next_italic] = lambda: parse_italic(text, in_cite=in_cite, in_bold=in_bold) # Map the next parsing step at -1. If we're currently inside a formatting # mark pair, skip parsing line breaks, which are not allowed inside paired # marks. diff --git a/tests/test_parser.py b/tests/test_parser.py index 269c815..6be240b 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -25,18 +25,18 @@ from amanuensis.parser.parsing import ( def assert_types(spans: Spans, types: Sequence, loc=None): """ - Asserts that a span list has the types specified. + Asserts that a span list has the types specified. Each element in `types` should be either a span type or a list. The first element of the list is the container type and the remaining elements are the content types. 
""" - assert len(spans) == len( - types - ), f"Unexpected type sequence length at loc {loc if loc else 'root'}" - i = -1 - for span, span_type in zip(spans, types): - i += 1 + for i in range(max(len(spans), len(types))): i_loc = f"{loc}.{i}" if loc else f"{i}" + # Check lengths are equal + assert i < len(spans), f"Span list unexpectedly short at {i_loc}" + assert i < len(types), f"Type list unexpectedly short at {i_loc}" + # Check types are equal + span, span_type = spans[i], types[i] if isinstance(span_type, list): assert isinstance( span, SpanContainer @@ -317,3 +317,66 @@ def test_parse_citation_nested(): assert_text(spans, [["**hello world"], "**"]) citation: CitationSpan = spans[0] assert citation.cite_target == "**hello world" + + +def test_parse_paragraphs(): + """Test parsing paragraphs""" + para: str + span: SpanContainer + + # Body paragraph + para = "\tIn the beginning was the Word." + span = parse_paragraph(para) + assert_types([span], [[BodyParagraph, TextSpan]]) + assert_text([span], [["In the beginning was the Word."]]) + + # Signature paragraph + para = "~Ersatz Scrivener, scholar extraordinaire" + span = parse_paragraph(para) + assert_types([span], [[SignatureParagraph, TextSpan]]) + assert_text([span], [["Ersatz Scrivener, scholar extraordinaire"]]) + + +def test_parse_article(): + """Test the full article parser""" + article: str = ( + "Writing a **unit test** requires having test //content//.\n\n" + "This content, of course, must be [[created|Writing test collateral]].\n\n" + "~Bucky, unit test writer" + ) + parsed: ParsedArticle = parse_raw_markdown(article) + + assert_types( + [parsed], + [ + [ + ParsedArticle, + [ + BodyParagraph, + TextSpan, + [BoldSpan, TextSpan], + TextSpan, + [ItalicSpan, TextSpan], + TextSpan, + ], + [BodyParagraph, TextSpan, [CitationSpan, TextSpan], TextSpan], + [SignatureParagraph, TextSpan], + ] + ], + ) + assert_text( + [parsed], + [ + [ + [ + "Writing a ", + ["unit test"], + " requires having test ", + ["content"], 
+ ".", + ], + ["This content, of course, must be ", ["created"], "."], + ["Bucky, unit test writer"], + ] + ], + ) -- 2.44.1 From 00738b5a45b358d7cad4833ba8a5df17a467ac15 Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Fri, 11 Jun 2021 23:45:28 -0700 Subject: [PATCH 8/8] Add renderable visitor unit test --- tests/test_parser.py | 87 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 6be240b..3409cb1 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -342,7 +342,7 @@ def test_parse_article(): article: str = ( "Writing a **unit test** requires having test //content//.\n\n" "This content, of course, must be [[created|Writing test collateral]].\n\n" - "~Bucky, unit test writer" + "~Bucky\\\\\nUnit test writer" ) parsed: ParsedArticle = parse_raw_markdown(article) @@ -360,7 +360,7 @@ def test_parse_article(): TextSpan, ], [BodyParagraph, TextSpan, [CitationSpan, TextSpan], TextSpan], - [SignatureParagraph, TextSpan], + [SignatureParagraph, TextSpan, LineBreak, TextSpan], ] ], ) @@ -376,7 +376,88 @@ def test_parse_article(): ".", ], ["This content, of course, must be ", ["created"], "."], - ["Bucky, unit test writer"], + ["Bucky", None, "Unit test writer"], ] ], ) + + +def test_visitor(): + """Test that a visitor dispatches to hooks correctly""" + + class TestVisitor(RenderableVisitor): + def __init__(self): + self.visited = [] + + def TextSpan(self, span: TextSpan): + assert isinstance(span, TextSpan) + self.visited.append(span) + + def LineBreak(self, span: LineBreak): + assert isinstance(span, LineBreak) + self.visited.append(span) + + def ParsedArticle(self, span: ParsedArticle): + assert isinstance(span, ParsedArticle) + self.visited.append(span) + span.recurse(self) + + def BodyParagraph(self, span: BodyParagraph): + assert isinstance(span, BodyParagraph) + self.visited.append(span) + span.recurse(self) + + def SignatureParagraph(self, span: 
SignatureParagraph): + assert isinstance(span, SignatureParagraph) + self.visited.append(span) + span.recurse(self) + + def BoldSpan(self, span: BoldSpan): + assert isinstance(span, BoldSpan) + self.visited.append(span) + span.recurse(self) + + def ItalicSpan(self, span: ItalicSpan): + assert isinstance(span, ItalicSpan) + self.visited.append(span) + span.recurse(self) + + def CitationSpan(self, span: CitationSpan): + assert isinstance(span, CitationSpan) + self.visited.append(span) + span.recurse(self) + + article: str = ( + "Writing a **unit test** requires having test //content//.\n\n" + "This content, of course, must be [[created|Writing test collateral]].\n\n" + "~Bucky\\\\\nUnit test writer" + ) + parsed: ParsedArticle = parse_raw_markdown(article) + + visitor = TestVisitor() + # All the typecheck asserts pass + parsed.render(visitor) + # The test article should parse into these spans and visit in this (arbitrary) order + type_order = [ + ParsedArticle, + BodyParagraph, + TextSpan, + BoldSpan, + TextSpan, + TextSpan, + ItalicSpan, + TextSpan, + TextSpan, + BodyParagraph, + TextSpan, + CitationSpan, + TextSpan, + TextSpan, + SignatureParagraph, + TextSpan, + LineBreak, + TextSpan, + ] + assert len(visitor.visited) == len(type_order) + for span, type in zip(visitor.visited, type_order): + assert isinstance(span, type) -- 2.44.1