Incorporate parser into new code #12

Merged
Jaculabilis merged 8 commits from tvb/parser into develop 2021-06-12 17:28:19 +00:00
13 changed files with 920 additions and 405 deletions

View File

@ -9,11 +9,94 @@ from amanuensis.config import ReadOnlyOrderedDict
from amanuensis.models import LexiconModel, UserModel from amanuensis.models import LexiconModel, UserModel
from amanuensis.parser import ( from amanuensis.parser import (
parse_raw_markdown, parse_raw_markdown,
GetCitations,
HtmlRenderer,
titlesort, titlesort,
filesafe_title, filesafe_title)
ConstraintAnalysis) from amanuensis.parser.core import RenderableVisitor
class GetCitations(RenderableVisitor):
def __init__(self):
self.citations = []
def ParsedArticle(self, span):
span.recurse(self)
return self.citations
def CitationSpan(self, span):
self.citations.append(span.cite_target)
return self
class ConstraintAnalysis(RenderableVisitor):
def __init__(self, lexicon: LexiconModel):
self.info: List[str] = []
self.warning: List[str] = []
self.error: List[str] = []
self.word_count: int = 0
self.citations: list = []
self.signatures: int = 0
def TextSpan(self, span):
self.word_count += len(re.split(r'\s+', span.innertext.strip()))
return self
def SignatureParagraph(self, span):
self.signatures += 1
span.recurse(self)
return self
def CitationSpan(self, span):
self.citations.append(span.cite_target)
span.recurse(self)
return self
class HtmlRenderer(RenderableVisitor):
"""
Renders an article token tree into published article HTML.
"""
def __init__(self, lexicon_name: str, written_articles: Iterable[str]):
self.lexicon_name: str = lexicon_name
self.written_articles: Iterable[str] = written_articles
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
return '\n'.join(span.recurse(self))
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.written_articles:
link_class = ''
else:
link_class = ' class="phantom"'
# link = url_for(
# 'lexicon.article',
# name=self.lexicon_name,
# title=filesafe_title(span.cite_target))
link = (f'/lexicon/{self.lexicon_name}'
+ f'/article/{filesafe_title(span.cite_target)}')
return f'<a href="{link}"{link_class}>{"".join(span.recurse(self))}</a>'
def get_player_characters( def get_player_characters(

View File

@ -13,7 +13,7 @@
# from amanuensis.config.loader import AttrOrderedDict # from amanuensis.config.loader import AttrOrderedDict
# from amanuensis.errors import ArgumentError # from amanuensis.errors import ArgumentError
# from amanuensis.lexicon import LexiconModel # from amanuensis.lexicon import LexiconModel
# from amanuensis.parser import parse_raw_markdown, GetCitations, HtmlRenderer, filesafe_title, titlesort # from amanuensis.parser import parse_raw_markdown, filesafe_title, titlesort
# from amanuensis.resources import get_stream # from amanuensis.resources import get_stream

View File

@ -2,19 +2,14 @@
Module encapsulating all markdown parsing functionality. Module encapsulating all markdown parsing functionality.
""" """
from .analyze import ConstraintAnalysis, GetCitations from .core import RenderableVisitor
from .core import normalize_title from .helpers import normalize_title, filesafe_title, titlesort
from .helpers import titlesort, filesafe_title
from .parsing import parse_raw_markdown from .parsing import parse_raw_markdown
from .render import PreviewHtmlRenderer, HtmlRenderer
__all__ = [ __all__ = [
ConstraintAnalysis.__name__, "RenderableVisitor",
GetCitations.__name__, "normalize_title",
normalize_title.__name__, "filesafe_title",
titlesort.__name__, "titlesort",
filesafe_title.__name__, "parse_raw_markdown",
parse_raw_markdown.__name__,
PreviewHtmlRenderer.__name__,
HtmlRenderer.__name__,
] ]

View File

@ -1,49 +0,0 @@
"""
Internal module encapsulating visitors that compute metrics on articles
for verification against constraints.
"""
import re
from typing import List
from amanuensis.models import LexiconModel
from .core import RenderableVisitor
class GetCitations(RenderableVisitor):
def __init__(self):
self.citations = []
def ParsedArticle(self, span):
span.recurse(self)
return self.citations
def CitationSpan(self, span):
self.citations.append(span.cite_target)
return self
class ConstraintAnalysis(RenderableVisitor):
def __init__(self, lexicon: LexiconModel):
self.info: List[str] = []
self.warning: List[str] = []
self.error: List[str] = []
self.word_count: int = 0
self.citations: list = []
self.signatures: int = 0
def TextSpan(self, span):
self.word_count += len(re.split(r'\s+', span.innertext.strip()))
return self
def SignatureParagraph(self, span):
self.signatures += 1
span.recurse(self)
return self
def CitationSpan(self, span):
self.citations.append(span.cite_target)
span.recurse(self)
return self

View File

@ -5,32 +5,26 @@ which can be operated on by a visitor defining functions that hook off
of the different token types. of the different token types.
""" """
import re
from typing import Callable, Any, Sequence from typing import Callable, Any, Sequence
RenderHook = Callable[['Renderable'], Any] from .helpers import normalize_title
Spans = Sequence['Renderable']
def normalize_title(title: str) -> str: RenderHook = Callable[["Renderable"], Any]
""" Spans = Sequence["Renderable"]
Normalizes strings as titles:
- Strips leading and trailing whitespace
- Merges internal whitespace into a single space
- Capitalizes the first word
"""
cleaned = re.sub(r'\s+', " ", title.strip())
return cleaned[:1].capitalize() + cleaned[1:]
class Renderable(): class Renderable:
""" """
Base class for parsed markdown. Provides the `render()` method for Base class for parsed markdown. Provides the `render()` method for
visiting the token tree. visiting the token tree.
""" """
def render(self: 'Renderable', renderer: 'RenderableVisitor'):
def render(self: "Renderable", renderer: "RenderableVisitor"):
""" """
Execute the apppropriate visitor method on this Renderable. Execute the apppropriate visitor method on this Renderable.
Visitors implement hooks by declaring methods whose names are
the name of a Renderable class.
""" """
hook: RenderHook = getattr(renderer, type(self).__name__, None) hook: RenderHook = getattr(renderer, type(self).__name__, None)
if hook: if hook:
@ -39,30 +33,35 @@ class Renderable():
class TextSpan(Renderable): class TextSpan(Renderable):
"""An unstyled length of text.""" """A length of text."""
def __init__(self, innertext: str): def __init__(self, innertext: str):
self.innertext = innertext self.innertext = innertext
def __str__(self): def __repr__(self):
return f"[{self.innertext}]" return f"<{self.innertext}>"
class LineBreak(Renderable): class LineBreak(Renderable):
"""A line break within a paragraph.""" """A line break within a paragraph."""
def __str__(self):
def __repr__(self):
return "<break>" return "<break>"
class SpanContainer(Renderable): class SpanContainer(Renderable):
"""A formatting element that wraps some amount of text.""" """A formatting element that wraps some amount of text."""
def __init__(self, spans: Spans): def __init__(self, spans: Spans):
self.spans: Spans = spans self.spans: Spans = spans
def __str__(self): def __repr__(self):
return (f'[{type(self).__name__} ' return (
+ f'{" ".join([str(span) for span in self.spans])}]') f"<{type(self).__name__} "
+ f'{" ".join([repr(span) for span in self.spans])}>'
)
def recurse(self, renderer: 'RenderableVisitor'): def recurse(self, renderer: "RenderableVisitor"):
return [child.render(renderer) for child in self.spans] return [child.render(renderer) for child in self.spans]
@ -88,22 +87,26 @@ class ItalicSpan(SpanContainer):
class CitationSpan(SpanContainer): class CitationSpan(SpanContainer):
"""A citation to another article.""" """A citation to another article."""
def __init__(self, spans: Spans, cite_target: str): def __init__(self, spans: Spans, cite_target: str):
super().__init__(spans) super().__init__(spans)
# Normalize citation target on parse, since we don't want # Normalize citation target on parse, since we don't want
# abnormal title strings lying around causing trouble. # abnormal title strings lying around causing trouble.
self.cite_target: str = normalize_title(cite_target) self.cite_target: str = normalize_title(cite_target)
def __str__(self): def __repr__(self) -> str:
return (f'{{{" ".join([str(span) for span in self.spans])}' return (
+ f':{self.cite_target}}}') f'{{{" ".join([repr(span) for span in self.spans])}'
+ f":{self.cite_target}}}"
)
class RenderableVisitor(): class RenderableVisitor:
""" """
Default implementation of the visitor pattern. Executes once on Default implementation of the visitor pattern. Executes once on
each token in the tree and returns itself. each token in the tree and returns itself.
""" """
def TextSpan(self, span: TextSpan): def TextSpan(self, span: TextSpan):
return self return self

View File

@ -1,7 +1,22 @@
"""
Helper functions for manipulating titles during parsing
"""
import re import re
import urllib.parse import urllib.parse
def normalize_title(title: str) -> str:
"""
Normalizes strings as titles:
- Strips leading and trailing whitespace
- Merges internal whitespace into a single space
- Capitalizes the first word
"""
cleaned = re.sub(r"\s+", " ", title.strip())
return cleaned[:1].capitalize() + cleaned[1:]
def titlesort(title: str) -> str: def titlesort(title: str) -> str:
""" """
Strips articles off of titles for alphabetical sorting purposes Strips articles off of titles for alphabetical sorting purposes
@ -20,9 +35,19 @@ def filesafe_title(title: str) -> str:
""" """
Makes an article title filename-safe. Makes an article title filename-safe.
""" """
s = re.sub(r"\s+", '_', title) # Replace whitespace with _ # Replace whitespace with _
s = re.sub(r"~", '-', s) # parse.quote doesn't catch ~ s = re.sub(r"\s+", "_", title)
s = urllib.parse.quote(s) # Encode all other characters
s = re.sub(r"%", "", s) # Strip encoding %s # parse.quote doesn't catch ~
s = s[:64] # Limit to 64 characters s = re.sub(r"~", "-", s)
# Encode all other characters
s = urllib.parse.quote(s)
# Strip encoding %s
s = re.sub(r"%", "", s)
# Limit to 64 characters
s = s[:64]
return s return s

View File

@ -16,7 +16,7 @@ from .core import (
ItalicSpan, ItalicSpan,
CitationSpan, CitationSpan,
Renderable, Renderable,
SpanContainer SpanContainer,
) )
Spans = Sequence[Renderable] Spans = Sequence[Renderable]
@ -28,15 +28,18 @@ def parse_raw_markdown(text: str) -> ParsedArticle:
""" """
# Parse each paragraph individually, as no formatting applies # Parse each paragraph individually, as no formatting applies
# across paragraphs # across paragraphs
paragraphs = re.split(r'\n\n+', text) paragraphs = re.split(r"\n\n+", text)
parse_results = list(map(parse_paragraph, paragraphs)) parse_results = list(map(parse_paragraph, paragraphs))
return ParsedArticle(parse_results) return ParsedArticle(parse_results)
def parse_paragraph(text: str) -> SpanContainer: def parse_paragraph(text: str) -> SpanContainer:
"""
Parses a block of text into a paragraph object.
"""
# Parse the paragraph as a span of text # Parse the paragraph as a span of text
text = text.strip() text = text.strip()
if text and text[0] == '~': if text and text[0] == "~":
return SignatureParagraph(parse_paired_formatting(text[1:])) return SignatureParagraph(parse_paired_formatting(text[1:]))
else: else:
return BodyParagraph(parse_paired_formatting(text)) return BodyParagraph(parse_paired_formatting(text))
@ -44,34 +47,44 @@ def parse_paragraph(text: str) -> SpanContainer:
def parse_paired_formatting( def parse_paired_formatting(
text: str, text: str,
cite: bool = True, in_cite: bool = False,
bold: bool = True, in_bold: bool = False,
italic: bool = True) -> Spans: in_italic: bool = False,
) -> Spans:
"""
Parses citations, bolds, and italics, which can be nested inside each other.
A single type cannot nest inside itself, which is controlled by setting the
flag parameters to False.
"""
# Find positions of any paired formatting # Find positions of any paired formatting
first_cite = find_pair(text, "[[", "]]", cite) next_cite = find_pair(text, "[[", "]]") if not in_cite else -1
first_bold = find_pair(text, "**", "**", bold) next_bold = find_pair(text, "**", "**") if not in_bold else -1
first_italic = find_pair(text, "//", "//", italic) next_italic = find_pair(text, "//", "//") if not in_italic else -1
# Load the possible parse handlers into the map # Create a map from a formatting mark's distance to its parse handler
handlers = {} handlers = {}
handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) handlers[next_cite] = lambda: parse_citation(
handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) text, in_bold=in_bold, in_italic=in_italic
handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) )
# If nothing was found, move on to the next parsing step handlers[next_bold] = lambda: parse_bold(text, in_cite=in_cite, in_italic=in_italic)
handlers[next_italic] = lambda: parse_italic(text, in_cite=in_cite, in_bold=in_bold)
# Map the next parsing step at -1. If we're currently inside a formatting
# mark pair, skip parsing line breaks, which are not allowed inside paired
# marks.
if in_cite or in_bold or in_italic:
handlers[-1] = lambda: parse_text(text)
else:
handlers[-1] = lambda: parse_breaks(text) handlers[-1] = lambda: parse_breaks(text)
# Choose a handler based on the earliest found result # Choose the handler for the earliest found pair, or the default handler
finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] # at -1 if nothing was found.
finds = [i for i in (next_cite, next_bold, next_italic) if i > -1]
first = min(finds) if finds else -1 first = min(finds) if finds else -1
return handlers[first]() return handlers[first]()
def find_pair( def find_pair(text: str, open_tag: str, close_tag: str) -> int:
text: str, """
open_tag: str, Finds the beginning of a pair of formatting marks.
close_tag: str, """
valid: bool) -> int:
# If skipping, return -1
if not valid:
return -1
# If the open tag wasn't found, return -1 # If the open tag wasn't found, return -1
first = text.find(open_tag) first = text.find(open_tag)
if first < 0: if first < 0:
@ -84,7 +97,14 @@ def find_pair(
return first return first
def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans: def parse_citation(
text: str,
in_bold: bool = False,
in_italic: bool = False,
) -> Spans:
"""
Parses text into a citation span.
"""
cite_open = text.find("[[") cite_open = text.find("[[")
if cite_open > -1: if cite_open > -1:
cite_close = text.find("]]", cite_open + 2) cite_close = text.find("]]", cite_open + 2)
@ -94,51 +114,69 @@ def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
spans_before = parse_breaks(text[:cite_open]) spans_before = parse_breaks(text[:cite_open])
# Continue parsing pair formatting after this one closes with all # Continue parsing pair formatting after this one closes with all
# three as valid choices # three as valid choices
spans_after = parse_paired_formatting(text[cite_close + 2:]) spans_after = parse_paired_formatting(text[cite_close + 2 :])
# Parse inner text and skip parsing for this format pair # Parse inner text and skip parsing for this format pair
text_inner = text[cite_open + 2:cite_close] text_inner = text[cite_open + 2 : cite_close]
# For citations specifically, we may need to split off a citation # For citations specifically, try to split off a citation target.
# target from the alias text # If there's no citation target to split, use the same text as the
# citation text and the target.
inner_split = text_inner.split("|", 1) inner_split = text_inner.split("|", 1)
text_inner_actual, cite_target = inner_split[0], inner_split[-1] text_inner_actual, cite_target = inner_split[0], inner_split[-1]
spans_inner = parse_paired_formatting(text_inner_actual, spans_inner = parse_paired_formatting(
cite=False, bold=bold, italic=italic) text_inner_actual, in_cite=True, in_bold=in_bold, in_italic=in_italic
)
citation = CitationSpan(spans_inner, cite_target) citation = CitationSpan(spans_inner, cite_target)
return [*spans_before, citation, *spans_after] return [*spans_before, citation, *spans_after]
# Should never happen # Should never happen
return parse_breaks(text) return parse_breaks(text)
def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans: def parse_bold(
text: str,
in_cite: bool = False,
in_italic: bool = False,
) -> Spans:
"""
Parses text into a bold span.
"""
bold_open = text.find("**") bold_open = text.find("**")
if bold_open > -1: if bold_open > -1:
bold_close = text.find("**", bold_open + 2) bold_close = text.find("**", bold_open + 2)
# Should be no formatting behind us # Should be no formatting behind us
spans_before = parse_breaks(text[:bold_open]) spans_before = parse_breaks(text[:bold_open])
# Freely parse formatting after us # Freely parse formatting after us
spans_after = parse_paired_formatting(text[bold_close + 2:]) spans_after = parse_paired_formatting(text[bold_close + 2 :])
# Parse inner text minus bold parsing # Parse inner text minus bold parsing
text_inner = text[bold_open + 2:bold_close] text_inner = text[bold_open + 2 : bold_close]
spans_inner = parse_paired_formatting(text_inner, spans_inner = parse_paired_formatting(
cite=cite, bold=False, italic=italic) text_inner, in_cite=in_cite, in_bold=True, in_italic=in_italic
)
bold = BoldSpan(spans_inner) bold = BoldSpan(spans_inner)
return [*spans_before, bold, *spans_after] return [*spans_before, bold, *spans_after]
# Should never happen # Should never happen
return parse_italic(text) return parse_italic(text)
def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans: def parse_italic(
text: str,
in_cite: bool = False,
in_bold: bool = False,
) -> Spans:
"""
Parses text into an italic span.
"""
italic_open = text.find("//") italic_open = text.find("//")
if italic_open > -1: if italic_open > -1:
italic_close = text.find("//", italic_open + 2) italic_close = text.find("//", italic_open + 2)
# Should be no formatting behind us # Should be no formatting behind us
spans_before = parse_breaks(text[:italic_open]) spans_before = parse_breaks(text[:italic_open])
# Freely parse formatting after us # Freely parse formatting after us
spans_after = parse_paired_formatting(text[italic_close + 2:]) spans_after = parse_paired_formatting(text[italic_close + 2 :])
# Parse inner text minus italic parsing # Parse inner text minus italic parsing
text_inner = text[italic_open + 2:italic_close] text_inner = text[italic_open + 2 : italic_close]
spans_inner = parse_paired_formatting(text_inner, spans_inner = parse_paired_formatting(
cite=cite, bold=bold, italic=False) text_inner, in_cite=in_cite, in_bold=in_bold, in_italic=True
)
italic = ItalicSpan(spans_inner) italic = ItalicSpan(spans_inner)
return [*spans_before, italic, *spans_after] return [*spans_before, italic, *spans_after]
# Should never happen # Should never happen
@ -146,11 +184,26 @@ def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans:
def parse_breaks(text: str) -> Spans: def parse_breaks(text: str) -> Spans:
"""
Parses intra-paragraph line breaks.
"""
# Parse empty text into nothing
if not text: if not text:
return [] return []
# Split on the line break mark appearing at the end of the line
splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
# Put a LineBreak between each TextSpan
spans: Spans = [ spans: Spans = [
splits[i // 2] if i % 2 == 0 else LineBreak() splits[i // 2] if i % 2 == 0 else LineBreak()
for i in range(0, 2 * len(splits) - 1) for i in range(0, 2 * len(splits) - 1)
] ]
return spans return spans
def parse_text(text: str) -> Spans:
"""
Parses text with no remaining parseable marks.
"""
if not text:
return []
return [TextSpan(text)]

View File

@ -1,104 +0,0 @@
"""
Internal module encapsulating visitors that render articles into
readable formats.
"""
from typing import Iterable
from .core import RenderableVisitor
from .helpers import filesafe_title
class HtmlRenderer(RenderableVisitor):
"""
Renders an article token tree into published article HTML.
"""
def __init__(self, lexicon_name: str, written_articles: Iterable[str]):
self.lexicon_name: str = lexicon_name
self.written_articles: Iterable[str] = written_articles
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
return '\n'.join(span.recurse(self))
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.written_articles:
link_class = ''
else:
link_class = ' class="phantom"'
# link = url_for(
# 'lexicon.article',
# name=self.lexicon_name,
# title=filesafe_title(span.cite_target))
link = (f'/lexicon/{self.lexicon_name}'
+ f'/article/{filesafe_title(span.cite_target)}')
return f'<a href="{link}"{link_class}>{"".join(span.recurse(self))}</a>'
class PreviewHtmlRenderer(RenderableVisitor):
def __init__(self, lexicon):
with lexicon.ctx.read('info') as info:
self.article_map = {
title: article.character
for title, article in info.items()
}
self.citations = []
self.contents = ""
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
self.contents = '\n'.join(span.recurse(self))
return self
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.article_map:
if self.article_map.get(span.cite_target):
link_class = '[extant]'
else:
link_class = '[phantom]'
else:
link_class = '[new]'
self.citations.append(f'{span.cite_target} {link_class}')
return f'<u>{"".join(span.recurse(self))}</u>[{len(self.citations)}]'

View File

@ -15,9 +15,7 @@ from amanuensis.lexicon import (
create_character_in_lexicon, create_character_in_lexicon,
get_draft) get_draft)
from amanuensis.models import LexiconModel from amanuensis.models import LexiconModel
from amanuensis.parser import ( from amanuensis.parser import parse_raw_markdown
parse_raw_markdown,
PreviewHtmlRenderer)
from amanuensis.server.helpers import ( from amanuensis.server.helpers import (
lexicon_param, lexicon_param,
player_required, player_required,
@ -29,7 +27,7 @@ from .forms import (
LexiconPublishTurnForm, LexiconPublishTurnForm,
LexiconConfigForm) LexiconConfigForm)
from .editor import load_editor, new_draft, update_draft from .editor import load_editor, new_draft, update_draft, PreviewHtmlRenderer
bp_session = Blueprint('session', __name__, bp_session = Blueprint('session', __name__,

View File

@ -17,8 +17,56 @@ from amanuensis.lexicon import (
from amanuensis.models import LexiconModel from amanuensis.models import LexiconModel
from amanuensis.parser import ( from amanuensis.parser import (
normalize_title, normalize_title,
parse_raw_markdown, parse_raw_markdown)
PreviewHtmlRenderer) from amanuensis.parser.core import RenderableVisitor
class PreviewHtmlRenderer(RenderableVisitor):
def __init__(self, lexicon):
with lexicon.ctx.read('info') as info:
self.article_map = {
title: article.character
for title, article in info.items()
}
self.citations = []
self.contents = ""
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
self.contents = '\n'.join(span.recurse(self))
return self
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.article_map:
if self.article_map.get(span.cite_target):
link_class = '[extant]'
else:
link_class = '[phantom]'
else:
link_class = '[new]'
self.citations.append(f'{span.cite_target} {link_class}')
return f'<u>{"".join(span.recurse(self))}</u>[{len(self.citations)}]'
def load_editor(lexicon: LexiconModel, aid: str): def load_editor(lexicon: LexiconModel, aid: str):

View File

@ -1,4 +1,4 @@
[mypy] [mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
; mypy stable doesn't support pyproject.toml yet ; mypy stable doesn't support pyproject.toml yet

View File

@ -17,11 +17,11 @@ black = "^21.5b2"
mypy = "^0.812" mypy = "^0.812"
[tool.black] [tool.black]
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
[tool.mypy] [tool.mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
[tool.pytest.ini_options] [tool.pytest.ini_options]
addopts = "--show-capture=log" addopts = "--show-capture=log"

463
tests/test_parser.py Normal file
View File

@ -0,0 +1,463 @@
from typing import Sequence
from amanuensis.parser.core import (
TextSpan,
LineBreak,
ParsedArticle,
BodyParagraph,
SignatureParagraph,
BoldSpan,
ItalicSpan,
CitationSpan,
Renderable,
SpanContainer,
RenderableVisitor,
Spans,
)
from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort
from amanuensis.parser.parsing import (
parse_breaks,
parse_paired_formatting,
parse_paragraph,
parse_raw_markdown,
)
def assert_types(spans: Spans, types: Sequence, loc=None):
"""
Asserts that a span list has the types specified.
Each element in `types` should be either a span type or a list. The first
element of the list is the container type and the remaining elements are the
content types.
"""
for i in range(max(len(spans), len(types))):
i_loc = f"{loc}.{i}" if loc else f"{i}"
# Check lengths are equal
assert i < len(spans), f"Span list unexpectedly short at {i_loc}"
assert i < len(types), f"Type list unexpectedly short at {i_loc}"
# Check types are equal
span, span_type = spans[i], types[i]
if isinstance(span_type, list):
assert isinstance(
span, SpanContainer
), f"Expected a span container at loc {i_loc}"
assert (
len(span.spans) == len(span_type) - 1
), f"Unexpected container size at loc {i_loc}"
assert isinstance(
span, span_type[0]
), f"Unexpected container type at loc {i_loc}"
assert_types(span.spans, span_type[1:], loc=i_loc)
else:
assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}"
assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}"
def assert_text(spans: Spans, texts: Sequence, loc=None):
"""
Asserts that a span list has the inner text structure specified.
Each element in `texts` should be either a string or a list of the same.
"""
assert len(spans) == len(
texts
), f"Unexpected text sequence length at loc {loc if loc else 'root'}"
i = -1
for span, text in zip(spans, texts):
i += 1
i_loc = f"{loc}.{i}" if loc else f"{i}"
if isinstance(text, str):
assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}"
assert span.innertext == text, f"Unexpected text at loc {i_loc}"
elif isinstance(text, list):
assert isinstance(
span, SpanContainer
), f"Expected a span container at loc {i_loc}"
assert_text(span.spans, text, loc=i_loc)
else:
assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}"
def test_parse_breaks():
"""Test parsing for intra-pragraph line break"""
text: str
spans: Spans
# Only having a line break does nothing
text = "One\nTwo"
spans: Spans = parse_breaks(text)
assert_types(spans, [TextSpan])
assert_text(spans, [text])
# Having the mark causes the text to be split across it
text = r"One\\" + "\nTwo"
spans: Spans = parse_breaks(text)
assert_types(spans, [TextSpan, LineBreak, TextSpan])
assert_text(spans, ["One", None, "Two"])
# Multiple lines can be broken
text = r"One\\" + "\n" + r"Two\\" + "\nThree"
spans: Spans = parse_breaks(text)
assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan])
assert_text(spans, ["One", None, "Two", None, "Three"])
# The mark must be at the end of the line
text = r"One\\ " + "\nTwo"
spans: Spans = parse_breaks(text)
assert_types(spans, (TextSpan,))
assert_text(spans, [text])
def test_parse_pairs_single():
"""Test parsing for bold and italic marks"""
text: str
spans: Spans
# Empty pair marks should parse
text = "****"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan]])
text = "////"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan]])
# Pair marks with text inside should parse
text = "**hello**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan]])
assert_text(spans, [["hello"]])
text = "//hello//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan]])
assert_text(spans, [["hello"]])
# Text outside of pair marks should parse on the same level
text = "**hello** world"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["hello"], " world"])
text = "//hello// world"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan], TextSpan])
assert_text(spans, [["hello"], " world"])
# Text before, between, and after pair marks should parse
text = "In the **beginning** was //the// Word"
spans = parse_paired_formatting(text)
assert_types(
spans,
[TextSpan, [BoldSpan, TextSpan], TextSpan, [ItalicSpan, TextSpan], TextSpan],
)
assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"])
def test_parse_pairs_break():
"""Test pair marks with breaks"""
text: str
spans: Spans
text = r"**glory\\" + "\nhammer**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan]])
assert_text(spans, [["glory\\\\\nhammer"]])
text = r"//glory\\" + "\nhammer//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan]])
assert_text(spans, [["glory\\\\\nhammer"]])
text = r"**glory\\" + "\n**hammer**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["glory\\\\\n"], "hammer**"])
text = r"//glory\\" + "\n//hammer//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan], TextSpan])
assert_text(spans, [["glory\\\\\n"], "hammer//"])
def test_parse_pairs_nested():
"""Test parsing for nesting bold and italic"""
text: str
spans: Spans
# Simple nested test cases
text = "**//hello//**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, [ItalicSpan, TextSpan]]])
assert_text(spans, [[["hello"]]])
text = "//**world**//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, [BoldSpan, TextSpan]]])
assert_text(spans, [[["world"]]])
# Overlap should only parse the first
text = "**Hello//world**//"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["Hello//world"], "//"])
def test_normalize_title():
"""Test the title normalization used by the citation parser"""
nt = normalize_title
assert nt("hello") == "Hello"
assert nt(" world ") == "World"
assert nt("Waiting for Godot") == "Waiting for Godot"
assert nt("lowercase letters") == "Lowercase letters"
def test_parse_citation_single():
"""Test parsing citations, which have internal formatting"""
text: str
spans: Spans
# Simple test cases
text = "[[hello]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Hello"
text = "[[hello|world]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "World"
text = "[[hello||world]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "|world"
text = "[[ hello | world ]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [[" hello "]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "World"
text = "[[faith|hope|love]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["faith"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Hope|love"
text = "[[ [[|]] ]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan], TextSpan])
assert_text(spans, [[" [["], " ]]"])
citation: CitationSpan = spans[0]
assert citation.cite_target == ""
def test_parse_citation_break():
"""Test citations with breaks"""
text: str
spans: Spans
text = "[[hello\\\\\nworld]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello\\\\\nworld"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Hello\\\\ world"
text = "[[one|two\\\\\nthree]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["one"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Two\\\\ three"
def test_parse_citation_nested():
"""Test nesting with citations"""
text: str
spans: Spans
text = "[[**hello world**]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, [BoldSpan, TextSpan]]])
assert_text(spans, [[["hello world"]]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "**hello world**"
text = "[[**hello|world**]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["**hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "World**"
text = "**[[hello world]]**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, [CitationSpan, TextSpan]]])
assert_text(spans, [[["hello world"]]])
citation: CitationSpan = spans[0].spans[0]
assert citation.cite_target == "Hello world"
text = "**[[hello world**]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["[[hello world"], "]]"])
text = "[[**hello world]]**"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan], TextSpan])
assert_text(spans, [["**hello world"], "**"])
citation: CitationSpan = spans[0]
assert citation.cite_target == "**hello world"
def test_parse_paragraphs():
"""Test parsing paragraphs"""
para: str
span: SpanContainer
# Body paragraph
para = "\tIn the beginning was the Word."
span = parse_paragraph(para)
assert_types([span], [[BodyParagraph, TextSpan]])
assert_text([span], [["In the beginning was the Word."]])
# Signature paragraph
para = "~Ersatz Scrivener, scholar extraordinaire"
span = parse_paragraph(para)
assert_types([span], [[SignatureParagraph, TextSpan]])
assert_text([span], [["Ersatz Scrivener, scholar extraordinaire"]])
def test_parse_article():
"""Test the full article parser"""
article: str = (
"Writing a **unit test** requires having test //content//.\n\n"
"This content, of course, must be [[created|Writing test collateral]].\n\n"
"~Bucky\\\\\nUnit test writer"
)
parsed: ParsedArticle = parse_raw_markdown(article)
assert_types(
[parsed],
[
[
ParsedArticle,
[
BodyParagraph,
TextSpan,
[BoldSpan, TextSpan],
TextSpan,
[ItalicSpan, TextSpan],
TextSpan,
],
[BodyParagraph, TextSpan, [CitationSpan, TextSpan], TextSpan],
[SignatureParagraph, TextSpan, LineBreak, TextSpan],
]
],
)
assert_text(
[parsed],
[
[
[
"Writing a ",
["unit test"],
" requires having test ",
["content"],
".",
],
["This content, of course, must be ", ["created"], "."],
["Bucky", None, "Unit test writer"],
]
],
)
def test_visitor():
"""Test that a visitor dispatches to hooks correctly"""
class TestVisitor(RenderableVisitor):
def __init__(self):
self.visited = []
def TextSpan(self, span: TextSpan):
assert isinstance(span, TextSpan)
self.visited.append(span)
def LineBreak(self, span: LineBreak):
assert isinstance(span, LineBreak)
self.visited.append(span)
def ParsedArticle(self, span: ParsedArticle):
assert isinstance(span, ParsedArticle)
self.visited.append(span)
span.recurse(self)
def BodyParagraph(self, span: BodyParagraph):
assert isinstance(span, BodyParagraph)
self.visited.append(span)
span.recurse(self)
def SignatureParagraph(self, span: SignatureParagraph):
assert isinstance(span, SignatureParagraph)
self.visited.append(span)
span.recurse(self)
def BoldSpan(self, span: BoldSpan):
assert isinstance(span, BoldSpan)
self.visited.append(span)
span.recurse(self)
def ItalicSpan(self, span: ItalicSpan):
assert isinstance(span, ItalicSpan)
self.visited.append(span)
span.recurse(self)
def CitationSpan(self, span: CitationSpan):
assert isinstance(span, CitationSpan)
self.visited.append(span)
span.recurse(self)
article: str = (
"Writing a **unit test** requires having test //content//.\n\n"
"This content, of course, must be [[created|Writing test collateral]].\n\n"
"~Bucky\\\\\nUnit test writer"
)
parsed: ParsedArticle = parse_raw_markdown(article)
visitor = TestVisitor()
# All the typecheck asserts pass
parsed.render(visitor)
# The test article should parse into these spans and visit in this (arbitrary) order
type_order = [
ParsedArticle,
BodyParagraph,
TextSpan,
BoldSpan,
TextSpan,
TextSpan,
ItalicSpan,
TextSpan,
TextSpan,
BodyParagraph,
TextSpan,
CitationSpan,
TextSpan,
TextSpan,
SignatureParagraph,
TextSpan,
LineBreak,
TextSpan,
]
assert len(visitor.visited) == len(type_order)
for span, type in zip(visitor.visited, type_order):
assert isinstance(span, type)