Incorporate parser into new code #12

Merged
Jaculabilis merged 8 commits from tvb/parser into develop 2021-06-12 17:28:19 +00:00
13 changed files with 920 additions and 405 deletions

View File

@ -9,11 +9,94 @@ from amanuensis.config import ReadOnlyOrderedDict
from amanuensis.models import LexiconModel, UserModel from amanuensis.models import LexiconModel, UserModel
from amanuensis.parser import ( from amanuensis.parser import (
parse_raw_markdown, parse_raw_markdown,
GetCitations,
HtmlRenderer,
titlesort, titlesort,
filesafe_title, filesafe_title)
ConstraintAnalysis) from amanuensis.parser.core import RenderableVisitor
class GetCitations(RenderableVisitor):
def __init__(self):
self.citations = []
def ParsedArticle(self, span):
span.recurse(self)
return self.citations
def CitationSpan(self, span):
self.citations.append(span.cite_target)
return self
class ConstraintAnalysis(RenderableVisitor):
def __init__(self, lexicon: LexiconModel):
self.info: List[str] = []
self.warning: List[str] = []
self.error: List[str] = []
self.word_count: int = 0
self.citations: list = []
self.signatures: int = 0
def TextSpan(self, span):
self.word_count += len(re.split(r'\s+', span.innertext.strip()))
return self
def SignatureParagraph(self, span):
self.signatures += 1
span.recurse(self)
return self
def CitationSpan(self, span):
self.citations.append(span.cite_target)
span.recurse(self)
return self
class HtmlRenderer(RenderableVisitor):
"""
Renders an article token tree into published article HTML.
"""
def __init__(self, lexicon_name: str, written_articles: Iterable[str]):
self.lexicon_name: str = lexicon_name
self.written_articles: Iterable[str] = written_articles
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
return '\n'.join(span.recurse(self))
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.written_articles:
link_class = ''
else:
link_class = ' class="phantom"'
# link = url_for(
# 'lexicon.article',
# name=self.lexicon_name,
# title=filesafe_title(span.cite_target))
link = (f'/lexicon/{self.lexicon_name}'
+ f'/article/{filesafe_title(span.cite_target)}')
return f'<a href="{link}"{link_class}>{"".join(span.recurse(self))}</a>'
def get_player_characters( def get_player_characters(

View File

@ -13,7 +13,7 @@
# from amanuensis.config.loader import AttrOrderedDict # from amanuensis.config.loader import AttrOrderedDict
# from amanuensis.errors import ArgumentError # from amanuensis.errors import ArgumentError
# from amanuensis.lexicon import LexiconModel # from amanuensis.lexicon import LexiconModel
# from amanuensis.parser import parse_raw_markdown, GetCitations, HtmlRenderer, filesafe_title, titlesort # from amanuensis.parser import parse_raw_markdown, filesafe_title, titlesort
# from amanuensis.resources import get_stream # from amanuensis.resources import get_stream

View File

@ -2,19 +2,14 @@
Module encapsulating all markdown parsing functionality. Module encapsulating all markdown parsing functionality.
""" """
from .analyze import ConstraintAnalysis, GetCitations from .core import RenderableVisitor
from .core import normalize_title from .helpers import normalize_title, filesafe_title, titlesort
from .helpers import titlesort, filesafe_title
from .parsing import parse_raw_markdown from .parsing import parse_raw_markdown
from .render import PreviewHtmlRenderer, HtmlRenderer
__all__ = [ __all__ = [
ConstraintAnalysis.__name__, "RenderableVisitor",
GetCitations.__name__, "normalize_title",
normalize_title.__name__, "filesafe_title",
titlesort.__name__, "titlesort",
filesafe_title.__name__, "parse_raw_markdown",
parse_raw_markdown.__name__,
PreviewHtmlRenderer.__name__,
HtmlRenderer.__name__,
] ]

View File

@ -1,49 +0,0 @@
"""
Internal module encapsulating visitors that compute metrics on articles
for verification against constraints.
"""
import re
from typing import List
from amanuensis.models import LexiconModel
from .core import RenderableVisitor
class GetCitations(RenderableVisitor):
def __init__(self):
self.citations = []
def ParsedArticle(self, span):
span.recurse(self)
return self.citations
def CitationSpan(self, span):
self.citations.append(span.cite_target)
return self
class ConstraintAnalysis(RenderableVisitor):
def __init__(self, lexicon: LexiconModel):
self.info: List[str] = []
self.warning: List[str] = []
self.error: List[str] = []
self.word_count: int = 0
self.citations: list = []
self.signatures: int = 0
def TextSpan(self, span):
self.word_count += len(re.split(r'\s+', span.innertext.strip()))
return self
def SignatureParagraph(self, span):
self.signatures += 1
span.recurse(self)
return self
def CitationSpan(self, span):
self.citations.append(span.cite_target)
span.recurse(self)
return self

View File

@ -5,131 +5,134 @@ which can be operated on by a visitor defining functions that hook off
of the different token types. of the different token types.
""" """
import re
from typing import Callable, Any, Sequence from typing import Callable, Any, Sequence
RenderHook = Callable[['Renderable'], Any] from .helpers import normalize_title
Spans = Sequence['Renderable']
def normalize_title(title: str) -> str: RenderHook = Callable[["Renderable"], Any]
""" Spans = Sequence["Renderable"]
Normalizes strings as titles:
- Strips leading and trailing whitespace
- Merges internal whitespace into a single space
- Capitalizes the first word
"""
cleaned = re.sub(r'\s+', " ", title.strip())
return cleaned[:1].capitalize() + cleaned[1:]
class Renderable(): class Renderable:
""" """
Base class for parsed markdown. Provides the `render()` method for Base class for parsed markdown. Provides the `render()` method for
visiting the token tree. visiting the token tree.
""" """
def render(self: 'Renderable', renderer: 'RenderableVisitor'):
""" def render(self: "Renderable", renderer: "RenderableVisitor"):
Execute the apppropriate visitor method on this Renderable. """
""" Execute the apppropriate visitor method on this Renderable.
hook: RenderHook = getattr(renderer, type(self).__name__, None) Visitors implement hooks by declaring methods whose names are
if hook: the name of a Renderable class.
return hook(self) """
return None hook: RenderHook = getattr(renderer, type(self).__name__, None)
if hook:
return hook(self)
return None
class TextSpan(Renderable): class TextSpan(Renderable):
"""An unstyled length of text.""" """A length of text."""
def __init__(self, innertext: str):
self.innertext = innertext
def __str__(self): def __init__(self, innertext: str):
return f"[{self.innertext}]" self.innertext = innertext
def __repr__(self):
return f"<{self.innertext}>"
class LineBreak(Renderable): class LineBreak(Renderable):
"""A line break within a paragraph.""" """A line break within a paragraph."""
def __str__(self):
return "<break>" def __repr__(self):
return "<break>"
class SpanContainer(Renderable): class SpanContainer(Renderable):
"""A formatting element that wraps some amount of text.""" """A formatting element that wraps some amount of text."""
def __init__(self, spans: Spans):
self.spans: Spans = spans
def __str__(self): def __init__(self, spans: Spans):
return (f'[{type(self).__name__} ' self.spans: Spans = spans
+ f'{" ".join([str(span) for span in self.spans])}]')
def recurse(self, renderer: 'RenderableVisitor'): def __repr__(self):
return [child.render(renderer) for child in self.spans] return (
f"<{type(self).__name__} "
+ f'{" ".join([repr(span) for span in self.spans])}>'
)
def recurse(self, renderer: "RenderableVisitor"):
return [child.render(renderer) for child in self.spans]
class ParsedArticle(SpanContainer): class ParsedArticle(SpanContainer):
"""Token tree root node, containing some number of paragraph tokens.""" """Token tree root node, containing some number of paragraph tokens."""
class BodyParagraph(SpanContainer): class BodyParagraph(SpanContainer):
"""A normal paragraph.""" """A normal paragraph."""
class SignatureParagraph(SpanContainer): class SignatureParagraph(SpanContainer):
"""A paragraph preceded by a signature mark.""" """A paragraph preceded by a signature mark."""
class BoldSpan(SpanContainer): class BoldSpan(SpanContainer):
"""A span of text inside bold marks.""" """A span of text inside bold marks."""
class ItalicSpan(SpanContainer): class ItalicSpan(SpanContainer):
"""A span of text inside italic marks.""" """A span of text inside italic marks."""
class CitationSpan(SpanContainer): class CitationSpan(SpanContainer):
"""A citation to another article.""" """A citation to another article."""
def __init__(self, spans: Spans, cite_target: str):
super().__init__(spans)
# Normalize citation target on parse, since we don't want
# abnormal title strings lying around causing trouble.
self.cite_target: str = normalize_title(cite_target)
def __str__(self): def __init__(self, spans: Spans, cite_target: str):
return (f'{{{" ".join([str(span) for span in self.spans])}' super().__init__(spans)
+ f':{self.cite_target}}}') # Normalize citation target on parse, since we don't want
# abnormal title strings lying around causing trouble.
self.cite_target: str = normalize_title(cite_target)
def __repr__(self) -> str:
return (
f'{{{" ".join([repr(span) for span in self.spans])}'
+ f":{self.cite_target}}}"
)
class RenderableVisitor(): class RenderableVisitor:
""" """
Default implementation of the visitor pattern. Executes once on Default implementation of the visitor pattern. Executes once on
each token in the tree and returns itself. each token in the tree and returns itself.
""" """
def TextSpan(self, span: TextSpan):
return self
def LineBreak(self, span: LineBreak): def TextSpan(self, span: TextSpan):
return self return self
def ParsedArticle(self, span: ParsedArticle): def LineBreak(self, span: LineBreak):
span.recurse(self) return self
return self
def BodyParagraph(self, span: BodyParagraph): def ParsedArticle(self, span: ParsedArticle):
span.recurse(self) span.recurse(self)
return self return self
def SignatureParagraph(self, span: SignatureParagraph): def BodyParagraph(self, span: BodyParagraph):
span.recurse(self) span.recurse(self)
return self return self
def BoldSpan(self, span: BoldSpan): def SignatureParagraph(self, span: SignatureParagraph):
span.recurse(self) span.recurse(self)
return self return self
def ItalicSpan(self, span: ItalicSpan): def BoldSpan(self, span: BoldSpan):
span.recurse(self) span.recurse(self)
return self return self
def CitationSpan(self, span: CitationSpan): def ItalicSpan(self, span: ItalicSpan):
span.recurse(self) span.recurse(self)
return self return self
def CitationSpan(self, span: CitationSpan):
span.recurse(self)
return self

View File

@ -1,28 +1,53 @@
"""
Helper functions for manipulating titles during parsing
"""
import re import re
import urllib.parse import urllib.parse
def normalize_title(title: str) -> str:
"""
Normalizes strings as titles:
- Strips leading and trailing whitespace
- Merges internal whitespace into a single space
- Capitalizes the first word
"""
cleaned = re.sub(r"\s+", " ", title.strip())
return cleaned[:1].capitalize() + cleaned[1:]
def titlesort(title: str) -> str: def titlesort(title: str) -> str:
""" """
Strips articles off of titles for alphabetical sorting purposes Strips articles off of titles for alphabetical sorting purposes
""" """
lower = title.lower() lower = title.lower()
if lower.startswith("the "): if lower.startswith("the "):
return lower[4:] return lower[4:]
if lower.startswith("an "): if lower.startswith("an "):
return lower[3:] return lower[3:]
if lower.startswith("a "): if lower.startswith("a "):
return lower[2:] return lower[2:]
return lower return lower
def filesafe_title(title: str) -> str: def filesafe_title(title: str) -> str:
""" """
Makes an article title filename-safe. Makes an article title filename-safe.
""" """
s = re.sub(r"\s+", '_', title) # Replace whitespace with _ # Replace whitespace with _
s = re.sub(r"~", '-', s) # parse.quote doesn't catch ~ s = re.sub(r"\s+", "_", title)
s = urllib.parse.quote(s) # Encode all other characters
s = re.sub(r"%", "", s) # Strip encoding %s # parse.quote doesn't catch ~
s = s[:64] # Limit to 64 characters s = re.sub(r"~", "-", s)
return s
# Encode all other characters
s = urllib.parse.quote(s)
# Strip encoding %s
s = re.sub(r"%", "", s)
# Limit to 64 characters
s = s[:64]
return s

View File

@ -7,150 +7,203 @@ import re
from typing import Sequence from typing import Sequence
from .core import ( from .core import (
TextSpan, TextSpan,
LineBreak, LineBreak,
ParsedArticle, ParsedArticle,
BodyParagraph, BodyParagraph,
SignatureParagraph, SignatureParagraph,
BoldSpan, BoldSpan,
ItalicSpan, ItalicSpan,
CitationSpan, CitationSpan,
Renderable, Renderable,
SpanContainer SpanContainer,
) )
Spans = Sequence[Renderable] Spans = Sequence[Renderable]
def parse_raw_markdown(text: str) -> ParsedArticle: def parse_raw_markdown(text: str) -> ParsedArticle:
""" """
Parses a body of Lexipython markdown into a Renderable tree. Parses a body of Lexipython markdown into a Renderable tree.
""" """
# Parse each paragraph individually, as no formatting applies # Parse each paragraph individually, as no formatting applies
# across paragraphs # across paragraphs
paragraphs = re.split(r'\n\n+', text) paragraphs = re.split(r"\n\n+", text)
parse_results = list(map(parse_paragraph, paragraphs)) parse_results = list(map(parse_paragraph, paragraphs))
return ParsedArticle(parse_results) return ParsedArticle(parse_results)
def parse_paragraph(text: str) -> SpanContainer: def parse_paragraph(text: str) -> SpanContainer:
# Parse the paragraph as a span of text """
text = text.strip() Parses a block of text into a paragraph object.
if text and text[0] == '~': """
return SignatureParagraph(parse_paired_formatting(text[1:])) # Parse the paragraph as a span of text
else: text = text.strip()
return BodyParagraph(parse_paired_formatting(text)) if text and text[0] == "~":
return SignatureParagraph(parse_paired_formatting(text[1:]))
else:
return BodyParagraph(parse_paired_formatting(text))
def parse_paired_formatting( def parse_paired_formatting(
text: str, text: str,
cite: bool = True, in_cite: bool = False,
bold: bool = True, in_bold: bool = False,
italic: bool = True) -> Spans: in_italic: bool = False,
# Find positions of any paired formatting ) -> Spans:
first_cite = find_pair(text, "[[", "]]", cite) """
first_bold = find_pair(text, "**", "**", bold) Parses citations, bolds, and italics, which can be nested inside each other.
first_italic = find_pair(text, "//", "//", italic) A single type cannot nest inside itself, which is controlled by setting the
# Load the possible parse handlers into the map flag parameters to False.
handlers = {} """
handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) # Find positions of any paired formatting
handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) next_cite = find_pair(text, "[[", "]]") if not in_cite else -1
handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) next_bold = find_pair(text, "**", "**") if not in_bold else -1
# If nothing was found, move on to the next parsing step next_italic = find_pair(text, "//", "//") if not in_italic else -1
handlers[-1] = lambda: parse_breaks(text) # Create a map from a formatting mark's distance to its parse handler
# Choose a handler based on the earliest found result handlers = {}
finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] handlers[next_cite] = lambda: parse_citation(
first = min(finds) if finds else -1 text, in_bold=in_bold, in_italic=in_italic
return handlers[first]() )
handlers[next_bold] = lambda: parse_bold(text, in_cite=in_cite, in_italic=in_italic)
handlers[next_italic] = lambda: parse_italic(text, in_cite=in_cite, in_bold=in_bold)
# Map the next parsing step at -1. If we're currently inside a formatting
# mark pair, skip parsing line breaks, which are not allowed inside paired
# marks.
if in_cite or in_bold or in_italic:
handlers[-1] = lambda: parse_text(text)
else:
handlers[-1] = lambda: parse_breaks(text)
# Choose the handler for the earliest found pair, or the default handler
# at -1 if nothing was found.
finds = [i for i in (next_cite, next_bold, next_italic) if i > -1]
first = min(finds) if finds else -1
return handlers[first]()
def find_pair( def find_pair(text: str, open_tag: str, close_tag: str) -> int:
text: str, """
open_tag: str, Finds the beginning of a pair of formatting marks.
close_tag: str, """
valid: bool) -> int: # If the open tag wasn't found, return -1
# If skipping, return -1 first = text.find(open_tag)
if not valid: if first < 0:
return -1 return -1
# If the open tag wasn't found, return -1 # If the close tag wasn't found after the open tag, return -1
first = text.find(open_tag) second = text.find(close_tag, first + len(open_tag))
if first < 0: if second < 0:
return -1 return -1
# If the close tag wasn't found after the open tag, return -1 # Otherwise, the pair exists
second = text.find(close_tag, first + len(open_tag)) return first
if second < 0:
return -1
# Otherwise, the pair exists
return first
def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans: def parse_citation(
cite_open = text.find("[[") text: str,
if cite_open > -1: in_bold: bool = False,
cite_close = text.find("]]", cite_open + 2) in_italic: bool = False,
# Since we searched for pairs from the beginning, there should be no ) -> Spans:
# undetected pair formatting before this one, so move to the next """
# level of parsing Parses text into a citation span.
spans_before = parse_breaks(text[:cite_open]) """
# Continue parsing pair formatting after this one closes with all cite_open = text.find("[[")
# three as valid choices if cite_open > -1:
spans_after = parse_paired_formatting(text[cite_close + 2:]) cite_close = text.find("]]", cite_open + 2)
# Parse inner text and skip parsing for this format pair # Since we searched for pairs from the beginning, there should be no
text_inner = text[cite_open + 2:cite_close] # undetected pair formatting before this one, so move to the next
# For citations specifically, we may need to split off a citation # level of parsing
# target from the alias text spans_before = parse_breaks(text[:cite_open])
inner_split = text_inner.split("|", 1) # Continue parsing pair formatting after this one closes with all
text_inner_actual, cite_target = inner_split[0], inner_split[-1] # three as valid choices
spans_inner = parse_paired_formatting(text_inner_actual, spans_after = parse_paired_formatting(text[cite_close + 2 :])
cite=False, bold=bold, italic=italic) # Parse inner text and skip parsing for this format pair
citation = CitationSpan(spans_inner, cite_target) text_inner = text[cite_open + 2 : cite_close]
return [*spans_before, citation, *spans_after] # For citations specifically, try to split off a citation target.
# Should never happen # If there's no citation target to split, use the same text as the
return parse_breaks(text) # citation text and the target.
inner_split = text_inner.split("|", 1)
text_inner_actual, cite_target = inner_split[0], inner_split[-1]
spans_inner = parse_paired_formatting(
text_inner_actual, in_cite=True, in_bold=in_bold, in_italic=in_italic
)
citation = CitationSpan(spans_inner, cite_target)
return [*spans_before, citation, *spans_after]
# Should never happen
return parse_breaks(text)
def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans: def parse_bold(
bold_open = text.find("**") text: str,
if bold_open > -1: in_cite: bool = False,
bold_close = text.find("**", bold_open + 2) in_italic: bool = False,
# Should be no formatting behind us ) -> Spans:
spans_before = parse_breaks(text[:bold_open]) """
# Freely parse formatting after us Parses text into a bold span.
spans_after = parse_paired_formatting(text[bold_close + 2:]) """
# Parse inner text minus bold parsing bold_open = text.find("**")
text_inner = text[bold_open + 2:bold_close] if bold_open > -1:
spans_inner = parse_paired_formatting(text_inner, bold_close = text.find("**", bold_open + 2)
cite=cite, bold=False, italic=italic) # Should be no formatting behind us
bold = BoldSpan(spans_inner) spans_before = parse_breaks(text[:bold_open])
return [*spans_before, bold, *spans_after] # Freely parse formatting after us
# Should never happen spans_after = parse_paired_formatting(text[bold_close + 2 :])
return parse_italic(text) # Parse inner text minus bold parsing
text_inner = text[bold_open + 2 : bold_close]
spans_inner = parse_paired_formatting(
text_inner, in_cite=in_cite, in_bold=True, in_italic=in_italic
)
bold = BoldSpan(spans_inner)
return [*spans_before, bold, *spans_after]
# Should never happen
return parse_italic(text)
def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans: def parse_italic(
italic_open = text.find("//") text: str,
if italic_open > -1: in_cite: bool = False,
italic_close = text.find("//", italic_open + 2) in_bold: bool = False,
# Should be no formatting behind us ) -> Spans:
spans_before = parse_breaks(text[:italic_open]) """
# Freely parse formatting after us Parses text into an italic span.
spans_after = parse_paired_formatting(text[italic_close + 2:]) """
# Parse inner text minus italic parsing italic_open = text.find("//")
text_inner = text[italic_open + 2:italic_close] if italic_open > -1:
spans_inner = parse_paired_formatting(text_inner, italic_close = text.find("//", italic_open + 2)
cite=cite, bold=bold, italic=False) # Should be no formatting behind us
italic = ItalicSpan(spans_inner) spans_before = parse_breaks(text[:italic_open])
return [*spans_before, italic, *spans_after] # Freely parse formatting after us
# Should never happen spans_after = parse_paired_formatting(text[italic_close + 2 :])
return parse_breaks(text) # Parse inner text minus italic parsing
text_inner = text[italic_open + 2 : italic_close]
spans_inner = parse_paired_formatting(
text_inner, in_cite=in_cite, in_bold=in_bold, in_italic=True
)
italic = ItalicSpan(spans_inner)
return [*spans_before, italic, *spans_after]
# Should never happen
return parse_breaks(text)
def parse_breaks(text: str) -> Spans: def parse_breaks(text: str) -> Spans:
if not text: """
return [] Parses intra-paragraph line breaks.
splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) """
spans: Spans = [ # Parse empty text into nothing
splits[i // 2] if i % 2 == 0 else LineBreak() if not text:
for i in range(0, 2 * len(splits) - 1) return []
] # Split on the line break mark appearing at the end of the line
return spans splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
# Put a LineBreak between each TextSpan
spans: Spans = [
splits[i // 2] if i % 2 == 0 else LineBreak()
for i in range(0, 2 * len(splits) - 1)
]
return spans
def parse_text(text: str) -> Spans:
"""
Parses text with no remaining parseable marks.
"""
if not text:
return []
return [TextSpan(text)]

View File

@ -1,104 +0,0 @@
"""
Internal module encapsulating visitors that render articles into
readable formats.
"""
from typing import Iterable
from .core import RenderableVisitor
from .helpers import filesafe_title
class HtmlRenderer(RenderableVisitor):
"""
Renders an article token tree into published article HTML.
"""
def __init__(self, lexicon_name: str, written_articles: Iterable[str]):
self.lexicon_name: str = lexicon_name
self.written_articles: Iterable[str] = written_articles
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
return '\n'.join(span.recurse(self))
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.written_articles:
link_class = ''
else:
link_class = ' class="phantom"'
# link = url_for(
# 'lexicon.article',
# name=self.lexicon_name,
# title=filesafe_title(span.cite_target))
link = (f'/lexicon/{self.lexicon_name}'
+ f'/article/{filesafe_title(span.cite_target)}')
return f'<a href="{link}"{link_class}>{"".join(span.recurse(self))}</a>'
class PreviewHtmlRenderer(RenderableVisitor):
def __init__(self, lexicon):
with lexicon.ctx.read('info') as info:
self.article_map = {
title: article.character
for title, article in info.items()
}
self.citations = []
self.contents = ""
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
self.contents = '\n'.join(span.recurse(self))
return self
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.article_map:
if self.article_map.get(span.cite_target):
link_class = '[extant]'
else:
link_class = '[phantom]'
else:
link_class = '[new]'
self.citations.append(f'{span.cite_target} {link_class}')
return f'<u>{"".join(span.recurse(self))}</u>[{len(self.citations)}]'

View File

@ -15,9 +15,7 @@ from amanuensis.lexicon import (
create_character_in_lexicon, create_character_in_lexicon,
get_draft) get_draft)
from amanuensis.models import LexiconModel from amanuensis.models import LexiconModel
from amanuensis.parser import ( from amanuensis.parser import parse_raw_markdown
parse_raw_markdown,
PreviewHtmlRenderer)
from amanuensis.server.helpers import ( from amanuensis.server.helpers import (
lexicon_param, lexicon_param,
player_required, player_required,
@ -29,7 +27,7 @@ from .forms import (
LexiconPublishTurnForm, LexiconPublishTurnForm,
LexiconConfigForm) LexiconConfigForm)
from .editor import load_editor, new_draft, update_draft from .editor import load_editor, new_draft, update_draft, PreviewHtmlRenderer
bp_session = Blueprint('session', __name__, bp_session = Blueprint('session', __name__,

View File

@ -17,8 +17,56 @@ from amanuensis.lexicon import (
from amanuensis.models import LexiconModel from amanuensis.models import LexiconModel
from amanuensis.parser import ( from amanuensis.parser import (
normalize_title, normalize_title,
parse_raw_markdown, parse_raw_markdown)
PreviewHtmlRenderer) from amanuensis.parser.core import RenderableVisitor
class PreviewHtmlRenderer(RenderableVisitor):
def __init__(self, lexicon):
with lexicon.ctx.read('info') as info:
self.article_map = {
title: article.character
for title, article in info.items()
}
self.citations = []
self.contents = ""
def TextSpan(self, span):
return span.innertext
def LineBreak(self, span):
return '<br>'
def ParsedArticle(self, span):
self.contents = '\n'.join(span.recurse(self))
return self
def BodyParagraph(self, span):
return f'<p>{"".join(span.recurse(self))}</p>'
def SignatureParagraph(self, span):
return (
'<hr><span class="signature"><p>'
f'{"".join(span.recurse(self))}'
'</p></span>'
)
def BoldSpan(self, span):
return f'<b>{"".join(span.recurse(self))}</b>'
def ItalicSpan(self, span):
return f'<i>{"".join(span.recurse(self))}</i>'
def CitationSpan(self, span):
if span.cite_target in self.article_map:
if self.article_map.get(span.cite_target):
link_class = '[extant]'
else:
link_class = '[phantom]'
else:
link_class = '[new]'
self.citations.append(f'{span.cite_target} {link_class}')
return f'<u>{"".join(span.recurse(self))}</u>[{len(self.citations)}]'
def load_editor(lexicon: LexiconModel, aid: str): def load_editor(lexicon: LexiconModel, aid: str):

View File

@ -1,4 +1,4 @@
[mypy] [mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
; mypy stable doesn't support pyproject.toml yet ; mypy stable doesn't support pyproject.toml yet

View File

@ -17,11 +17,11 @@ black = "^21.5b2"
mypy = "^0.812" mypy = "^0.812"
[tool.black] [tool.black]
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
[tool.mypy] [tool.mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
[tool.pytest.ini_options] [tool.pytest.ini_options]
addopts = "--show-capture=log" addopts = "--show-capture=log"

463
tests/test_parser.py Normal file
View File

@ -0,0 +1,463 @@
from typing import Sequence
from amanuensis.parser.core import (
TextSpan,
LineBreak,
ParsedArticle,
BodyParagraph,
SignatureParagraph,
BoldSpan,
ItalicSpan,
CitationSpan,
Renderable,
SpanContainer,
RenderableVisitor,
Spans,
)
from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort
from amanuensis.parser.parsing import (
parse_breaks,
parse_paired_formatting,
parse_paragraph,
parse_raw_markdown,
)
def assert_types(spans: Spans, types: Sequence, loc=None):
"""
Asserts that a span list has the types specified.
Each element in `types` should be either a span type or a list. The first
element of the list is the container type and the remaining elements are the
content types.
"""
for i in range(max(len(spans), len(types))):
i_loc = f"{loc}.{i}" if loc else f"{i}"
# Check lengths are equal
assert i < len(spans), f"Span list unexpectedly short at {i_loc}"
assert i < len(types), f"Type list unexpectedly short at {i_loc}"
# Check types are equal
span, span_type = spans[i], types[i]
if isinstance(span_type, list):
assert isinstance(
span, SpanContainer
), f"Expected a span container at loc {i_loc}"
assert (
len(span.spans) == len(span_type) - 1
), f"Unexpected container size at loc {i_loc}"
assert isinstance(
span, span_type[0]
), f"Unexpected container type at loc {i_loc}"
assert_types(span.spans, span_type[1:], loc=i_loc)
else:
assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}"
assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}"
def assert_text(spans: Spans, texts: Sequence, loc=None):
"""
Asserts that a span list has the inner text structure specified.
Each element in `texts` should be either a string or a list of the same.
"""
assert len(spans) == len(
texts
), f"Unexpected text sequence length at loc {loc if loc else 'root'}"
i = -1
for span, text in zip(spans, texts):
i += 1
i_loc = f"{loc}.{i}" if loc else f"{i}"
if isinstance(text, str):
assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}"
assert span.innertext == text, f"Unexpected text at loc {i_loc}"
elif isinstance(text, list):
assert isinstance(
span, SpanContainer
), f"Expected a span container at loc {i_loc}"
assert_text(span.spans, text, loc=i_loc)
else:
assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}"
def test_parse_breaks():
"""Test parsing for intra-pragraph line break"""
text: str
spans: Spans
# Only having a line break does nothing
text = "One\nTwo"
spans: Spans = parse_breaks(text)
assert_types(spans, [TextSpan])
assert_text(spans, [text])
# Having the mark causes the text to be split across it
text = r"One\\" + "\nTwo"
spans: Spans = parse_breaks(text)
assert_types(spans, [TextSpan, LineBreak, TextSpan])
assert_text(spans, ["One", None, "Two"])
# Multiple lines can be broken
text = r"One\\" + "\n" + r"Two\\" + "\nThree"
spans: Spans = parse_breaks(text)
assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan])
assert_text(spans, ["One", None, "Two", None, "Three"])
# The mark must be at the end of the line
text = r"One\\ " + "\nTwo"
spans: Spans = parse_breaks(text)
assert_types(spans, (TextSpan,))
assert_text(spans, [text])
def test_parse_pairs_single():
"""Test parsing for bold and italic marks"""
text: str
spans: Spans
# Empty pair marks should parse
text = "****"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan]])
text = "////"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan]])
# Pair marks with text inside should parse
text = "**hello**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan]])
assert_text(spans, [["hello"]])
text = "//hello//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan]])
assert_text(spans, [["hello"]])
# Text outside of pair marks should parse on the same level
text = "**hello** world"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["hello"], " world"])
text = "//hello// world"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan], TextSpan])
assert_text(spans, [["hello"], " world"])
# Text before, between, and after pair marks should parse
text = "In the **beginning** was //the// Word"
spans = parse_paired_formatting(text)
assert_types(
spans,
[TextSpan, [BoldSpan, TextSpan], TextSpan, [ItalicSpan, TextSpan], TextSpan],
)
assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"])
def test_parse_pairs_break():
"""Test pair marks with breaks"""
text: str
spans: Spans
text = r"**glory\\" + "\nhammer**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan]])
assert_text(spans, [["glory\\\\\nhammer"]])
text = r"//glory\\" + "\nhammer//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan]])
assert_text(spans, [["glory\\\\\nhammer"]])
text = r"**glory\\" + "\n**hammer**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["glory\\\\\n"], "hammer**"])
text = r"//glory\\" + "\n//hammer//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, TextSpan], TextSpan])
assert_text(spans, [["glory\\\\\n"], "hammer//"])
def test_parse_pairs_nested():
"""Test parsing for nesting bold and italic"""
text: str
spans: Spans
# Simple nested test cases
text = "**//hello//**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, [ItalicSpan, TextSpan]]])
assert_text(spans, [[["hello"]]])
text = "//**world**//"
spans = parse_paired_formatting(text)
assert_types(spans, [[ItalicSpan, [BoldSpan, TextSpan]]])
assert_text(spans, [[["world"]]])
# Overlap should only parse the first
text = "**Hello//world**//"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["Hello//world"], "//"])
def test_normalize_title():
"""Test the title normalization used by the citation parser"""
nt = normalize_title
assert nt("hello") == "Hello"
assert nt(" world ") == "World"
assert nt("Waiting for Godot") == "Waiting for Godot"
assert nt("lowercase letters") == "Lowercase letters"
def test_parse_citation_single():
"""Test parsing citations, which have internal formatting"""
text: str
spans: Spans
# Simple test cases
text = "[[hello]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Hello"
text = "[[hello|world]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "World"
text = "[[hello||world]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "|world"
text = "[[ hello | world ]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [[" hello "]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "World"
text = "[[faith|hope|love]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["faith"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Hope|love"
text = "[[ [[|]] ]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan], TextSpan])
assert_text(spans, [[" [["], " ]]"])
citation: CitationSpan = spans[0]
assert citation.cite_target == ""
def test_parse_citation_break():
"""Test citations with breaks"""
text: str
spans: Spans
text = "[[hello\\\\\nworld]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["hello\\\\\nworld"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Hello\\\\ world"
text = "[[one|two\\\\\nthree]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["one"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "Two\\\\ three"
def test_parse_citation_nested():
"""Test nesting with citations"""
text: str
spans: Spans
text = "[[**hello world**]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, [BoldSpan, TextSpan]]])
assert_text(spans, [[["hello world"]]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "**hello world**"
text = "[[**hello|world**]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan]])
assert_text(spans, [["**hello"]])
citation: CitationSpan = spans[0]
assert citation.cite_target == "World**"
text = "**[[hello world]]**"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, [CitationSpan, TextSpan]]])
assert_text(spans, [[["hello world"]]])
citation: CitationSpan = spans[0].spans[0]
assert citation.cite_target == "Hello world"
text = "**[[hello world**]]"
spans = parse_paired_formatting(text)
assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
assert_text(spans, [["[[hello world"], "]]"])
text = "[[**hello world]]**"
spans = parse_paired_formatting(text)
assert_types(spans, [[CitationSpan, TextSpan], TextSpan])
assert_text(spans, [["**hello world"], "**"])
citation: CitationSpan = spans[0]
assert citation.cite_target == "**hello world"
def test_parse_paragraphs():
"""Test parsing paragraphs"""
para: str
span: SpanContainer
# Body paragraph
para = "\tIn the beginning was the Word."
span = parse_paragraph(para)
assert_types([span], [[BodyParagraph, TextSpan]])
assert_text([span], [["In the beginning was the Word."]])
# Signature paragraph
para = "~Ersatz Scrivener, scholar extraordinaire"
span = parse_paragraph(para)
assert_types([span], [[SignatureParagraph, TextSpan]])
assert_text([span], [["Ersatz Scrivener, scholar extraordinaire"]])
def test_parse_article():
"""Test the full article parser"""
article: str = (
"Writing a **unit test** requires having test //content//.\n\n"
"This content, of course, must be [[created|Writing test collateral]].\n\n"
"~Bucky\\\\\nUnit test writer"
)
parsed: ParsedArticle = parse_raw_markdown(article)
assert_types(
[parsed],
[
[
ParsedArticle,
[
BodyParagraph,
TextSpan,
[BoldSpan, TextSpan],
TextSpan,
[ItalicSpan, TextSpan],
TextSpan,
],
[BodyParagraph, TextSpan, [CitationSpan, TextSpan], TextSpan],
[SignatureParagraph, TextSpan, LineBreak, TextSpan],
]
],
)
assert_text(
[parsed],
[
[
[
"Writing a ",
["unit test"],
" requires having test ",
["content"],
".",
],
["This content, of course, must be ", ["created"], "."],
["Bucky", None, "Unit test writer"],
]
],
)
def test_visitor():
"""Test that a visitor dispatches to hooks correctly"""
class TestVisitor(RenderableVisitor):
def __init__(self):
self.visited = []
def TextSpan(self, span: TextSpan):
assert isinstance(span, TextSpan)
self.visited.append(span)
def LineBreak(self, span: LineBreak):
assert isinstance(span, LineBreak)
self.visited.append(span)
def ParsedArticle(self, span: ParsedArticle):
assert isinstance(span, ParsedArticle)
self.visited.append(span)
span.recurse(self)
def BodyParagraph(self, span: BodyParagraph):
assert isinstance(span, BodyParagraph)
self.visited.append(span)
span.recurse(self)
def SignatureParagraph(self, span: SignatureParagraph):
assert isinstance(span, SignatureParagraph)
self.visited.append(span)
span.recurse(self)
def BoldSpan(self, span: BoldSpan):
assert isinstance(span, BoldSpan)
self.visited.append(span)
span.recurse(self)
def ItalicSpan(self, span: ItalicSpan):
assert isinstance(span, ItalicSpan)
self.visited.append(span)
span.recurse(self)
def CitationSpan(self, span: CitationSpan):
assert isinstance(span, CitationSpan)
self.visited.append(span)
span.recurse(self)
article: str = (
"Writing a **unit test** requires having test //content//.\n\n"
"This content, of course, must be [[created|Writing test collateral]].\n\n"
"~Bucky\\\\\nUnit test writer"
)
parsed: ParsedArticle = parse_raw_markdown(article)
visitor = TestVisitor()
# All the typecheck asserts pass
parsed.render(visitor)
# The test article should parse into these spans and visit in this (arbitrary) order
type_order = [
ParsedArticle,
BodyParagraph,
TextSpan,
BoldSpan,
TextSpan,
TextSpan,
ItalicSpan,
TextSpan,
TextSpan,
BodyParagraph,
TextSpan,
CitationSpan,
TextSpan,
TextSpan,
SignatureParagraph,
TextSpan,
LineBreak,
TextSpan,
]
assert len(visitor.visited) == len(type_order)
for span, type in zip(visitor.visited, type_order):
assert isinstance(span, type)