464 lines
15 KiB
Python
464 lines
15 KiB
Python
from typing import Sequence
|
|
|
|
from amanuensis.parser.core import (
|
|
TextSpan,
|
|
LineBreak,
|
|
ParsedArticle,
|
|
BodyParagraph,
|
|
SignatureParagraph,
|
|
BoldSpan,
|
|
ItalicSpan,
|
|
CitationSpan,
|
|
Renderable,
|
|
SpanContainer,
|
|
RenderableVisitor,
|
|
Spans,
|
|
)
|
|
from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort
|
|
from amanuensis.parser.parsing import (
|
|
parse_breaks,
|
|
parse_paired_formatting,
|
|
parse_paragraph,
|
|
parse_raw_markdown,
|
|
)
|
|
|
|
|
|
def assert_types(spans: Spans, types: Sequence, loc=None):
    """
    Check that a span list matches an expected tree of span types.

    An entry of `types` is either a span type, matched against the span at
    the same position, or a list whose first element is a container type and
    whose remaining elements describe that container's child spans (checked
    recursively).

    `loc` is the dotted index path of the enclosing container and is only
    used to build assertion messages. The first mismatch raises
    AssertionError.
    """
    span_count, type_count = len(spans), len(types)
    prefix = f"{loc}." if loc else ""

    # Walk the longer of the two sequences so that a length mismatch is
    # reported at the first index missing from the shorter one
    for index in range(max(span_count, type_count)):
        here = f"{prefix}{index}"
        assert index < span_count, f"Span list unexpectedly short at {here}"
        assert index < type_count, f"Type list unexpectedly short at {here}"

        actual, expected = spans[index], types[index]

        if not isinstance(expected, list):
            # Leaf expectation: a single span type
            assert isinstance(actual, Renderable), f"Expected a span at loc {here}"
            assert isinstance(
                actual, expected
            ), f"Unexpected span type at loc {here}"
            continue

        # Container expectation: [container type, child expectations...]
        assert isinstance(
            actual, SpanContainer
        ), f"Expected a span container at loc {here}"
        child_count = len(actual.spans)
        assert (
            child_count == len(expected) - 1
        ), f"Unexpected container size at loc {here}"
        assert isinstance(
            actual, expected[0]
        ), f"Unexpected container type at loc {here}"
        assert_types(actual.spans, expected[1:], loc=here)
|
|
|
|
|
|
def assert_text(spans: Spans, texts: Sequence, loc=None):
    """
    Asserts that a span list has the inner text structure specified.

    Each element in `texts` describes the span at the same position:

    - a string expects a TextSpan whose `innertext` equals that string;
    - a list expects a SpanContainer whose child spans match the list,
      checked recursively;
    - anything else (the tests in this file pass None) expects a LineBreak.

    `loc` is the dotted index path of the enclosing container and is only
    used to build assertion messages. The first mismatch raises
    AssertionError.
    """
    # zip() below would silently stop at the shorter sequence, so the lengths
    # are compared up front
    assert len(spans) == len(
        texts
    ), f"Unexpected text sequence length at loc {loc if loc else 'root'}"
    for i, (span, text) in enumerate(zip(spans, texts)):
        i_loc = f"{loc}.{i}" if loc else f"{i}"
        if isinstance(text, str):
            assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}"
            assert span.innertext == text, f"Unexpected text at loc {i_loc}"
        elif isinstance(text, list):
            assert isinstance(
                span, SpanContainer
            ), f"Expected a span container at loc {i_loc}"
            assert_text(span.spans, text, loc=i_loc)
        else:
            # Any non-string, non-list placeholder stands for a line break
            assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}"
|
|
|
|
|
|
def test_parse_breaks():
    """Test parsing for intra-paragraph line breaks"""
    text: str
    spans: Spans

    # Only having a line break does nothing
    text = "One\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan])
    assert_text(spans, [text])

    # Having the mark (a double backslash right before the newline) causes
    # the text to be split across it
    text = r"One\\" + "\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan, LineBreak, TextSpan])
    assert_text(spans, ["One", None, "Two"])

    # Multiple lines can be broken
    text = r"One\\" + "\n" + r"Two\\" + "\nThree"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan])
    assert_text(spans, ["One", None, "Two", None, "Three"])

    # The mark must be at the end of the line: a trailing space after it
    # leaves the text unsplit
    text = r"One\\ " + "\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan])
    assert_text(spans, [text])
|
|
|
|
|
|
def test_parse_pairs_single():
    """Check that bold (`**`) and italic (`//`) pair marks parse into spans"""
    # Each case is (source text, expected type tree, expected text tree).
    # A text tree of None means only the types are checked for that case.
    cases = [
        # Empty pair marks should parse
        ("****", [[BoldSpan]], None),
        ("////", [[ItalicSpan]], None),
        # Pair marks with text inside should parse
        ("**hello**", [[BoldSpan, TextSpan]], [["hello"]]),
        ("//hello//", [[ItalicSpan, TextSpan]], [["hello"]]),
        # Text outside of pair marks should parse on the same level
        ("**hello** world", [[BoldSpan, TextSpan], TextSpan], [["hello"], " world"]),
        ("//hello// world", [[ItalicSpan, TextSpan], TextSpan], [["hello"], " world"]),
        # Text before, between, and after pair marks should parse
        (
            "In the **beginning** was //the// Word",
            [TextSpan, [BoldSpan, TextSpan], TextSpan, [ItalicSpan, TextSpan], TextSpan],
            ["In the ", ["beginning"], " was ", ["the"], " Word"],
        ),
    ]

    for source, expected_types, expected_text in cases:
        parsed: Spans = parse_paired_formatting(source)
        assert_types(parsed, expected_types)
        if expected_text is not None:
            assert_text(parsed, expected_text)
|
|
|
|
|
|
def test_parse_pairs_break():
    """Check how pair marks behave when a break mark appears inside them"""
    # Each case is (source text, expected type tree, expected text tree)
    cases = [
        # A break mark inside a pair is kept as literal text in the span
        (
            r"**glory\\" + "\nhammer**",
            [[BoldSpan, TextSpan]],
            [["glory\\\\\nhammer"]],
        ),
        (
            r"//glory\\" + "\nhammer//",
            [[ItalicSpan, TextSpan]],
            [["glory\\\\\nhammer"]],
        ),
        # A pair mark right after the break closes the pair; the rest,
        # including the final mark, is left as plain text
        (
            r"**glory\\" + "\n**hammer**",
            [[BoldSpan, TextSpan], TextSpan],
            [["glory\\\\\n"], "hammer**"],
        ),
        (
            r"//glory\\" + "\n//hammer//",
            [[ItalicSpan, TextSpan], TextSpan],
            [["glory\\\\\n"], "hammer//"],
        ),
    ]

    for source, expected_types, expected_text in cases:
        parsed: Spans = parse_paired_formatting(source)
        assert_types(parsed, expected_types)
        assert_text(parsed, expected_text)
|
|
|
|
|
|
def test_parse_pairs_nested():
    """Check bold and italic marks nested inside one another"""
    # Each case is (source text, expected type tree, expected text tree)
    cases = [
        # Simple nested test cases
        ("**//hello//**", [[BoldSpan, [ItalicSpan, TextSpan]]], [[["hello"]]]),
        ("//**world**//", [[ItalicSpan, [BoldSpan, TextSpan]]], [[["world"]]]),
        # Overlap should only parse the first
        (
            "**Hello//world**//",
            [[BoldSpan, TextSpan], TextSpan],
            [["Hello//world"], "//"],
        ),
    ]

    for source, expected_types, expected_text in cases:
        parsed: Spans = parse_paired_formatting(source)
        assert_types(parsed, expected_types)
        assert_text(parsed, expected_text)
|
|
|
|
|
|
def test_normalize_title():
    """Test the title normalization used by the citation parser"""
    # Raw title -> expected normalized title. The expectations show the
    # surrounding whitespace stripped and only the first letter capitalized.
    expectations = {
        "hello": "Hello",
        " world ": "World",
        "Waiting for Godot": "Waiting for Godot",
        "lowercase letters": "Lowercase letters",
    }
    for raw, normalized in expectations.items():
        assert normalize_title(raw) == normalized
|
|
|
|
|
|
def test_parse_citation_single():
    """Check citation marks (`[[text|target]]`) and their resolved targets"""
    plain_citation = [[CitationSpan, TextSpan]]

    # Each case is (source text, expected type tree, expected text tree,
    # expected cite_target of the first span)
    cases = [
        # Without a pipe, the target is the normalized citation text
        ("[[hello]]", plain_citation, [["hello"]], "Hello"),
        # With a pipe, the text is the left side and the target the right
        ("[[hello|world]]", plain_citation, [["hello"]], "World"),
        # Only the first pipe splits; later pipes belong to the target
        ("[[hello||world]]", plain_citation, [["hello"]], "|world"),
        # Whitespace stays in the text but not in the target
        ("[[ hello | world ]]", plain_citation, [[" hello "]], "World"),
        ("[[faith|hope|love]]", plain_citation, [["faith"]], "Hope|love"),
        # The first closing mark ends the citation, leaving an empty target
        (
            "[[ [[|]] ]]",
            [[CitationSpan, TextSpan], TextSpan],
            [[" [["], " ]]"],
            "",
        ),
    ]

    for source, expected_types, expected_text, expected_target in cases:
        parsed: Spans = parse_paired_formatting(source)
        assert_types(parsed, expected_types)
        assert_text(parsed, expected_text)
        first: CitationSpan = parsed[0]
        assert first.cite_target == expected_target
|
|
|
|
|
|
def test_parse_citation_break():
    """Check citations whose source contains a break mark"""
    # Each case is (source text, expected text tree, expected cite_target).
    # In both cases the break mark stays literal, and the expected target
    # has a space where the source has the newline.
    cases = [
        ("[[hello\\\\\nworld]]", [["hello\\\\\nworld"]], "Hello\\\\ world"),
        ("[[one|two\\\\\nthree]]", [["one"]], "Two\\\\ three"),
    ]

    for source, expected_text, expected_target in cases:
        parsed: Spans = parse_paired_formatting(source)
        assert_types(parsed, [[CitationSpan, TextSpan]])
        assert_text(parsed, expected_text)
        first: CitationSpan = parsed[0]
        assert first.cite_target == expected_target
|
|
|
|
|
|
def test_parse_citation_nested():
    """Check citations combined with bold marks, nested and overlapping"""

    def parse_checked(source, expected_types, expected_text) -> Spans:
        """Parse `source`, check its type and text trees, return the spans."""
        parsed = parse_paired_formatting(source)
        assert_types(parsed, expected_types)
        assert_text(parsed, expected_text)
        return parsed

    # Bold inside a citation: formatting is parsed in the text, while the
    # target keeps the raw marks
    result = parse_checked(
        "[[**hello world**]]",
        [[CitationSpan, [BoldSpan, TextSpan]]],
        [[["hello world"]]],
    )
    assert result[0].cite_target == "**hello world**"

    # A pipe between the bold marks splits them across text and target
    result = parse_checked(
        "[[**hello|world**]]",
        [[CitationSpan, TextSpan]],
        [["**hello"]],
    )
    assert result[0].cite_target == "World**"

    # Citation inside bold: the citation is the bold span's only child
    result = parse_checked(
        "**[[hello world]]**",
        [[BoldSpan, [CitationSpan, TextSpan]]],
        [[["hello world"]]],
    )
    assert result[0].spans[0].cite_target == "Hello world"

    # Overlap, bold opened first: only the bold pair parses
    parse_checked(
        "**[[hello world**]]",
        [[BoldSpan, TextSpan], TextSpan],
        [["[[hello world"], "]]"],
    )

    # Overlap, citation opened first: only the citation parses
    result = parse_checked(
        "[[**hello world]]**",
        [[CitationSpan, TextSpan], TextSpan],
        [["**hello world"], "**"],
    )
    assert result[0].cite_target == "**hello world"
|
|
|
|
|
|
def test_parse_paragraphs():
    """Check that single paragraphs parse into the right paragraph type"""
    # Each case is (source paragraph, expected paragraph type, expected text)
    cases = [
        # Body paragraph
        (
            "\tIn the beginning was the Word.",
            BodyParagraph,
            "In the beginning was the Word.",
        ),
        # Signature paragraph
        (
            "~Ersatz Scrivener, scholar extraordinaire",
            SignatureParagraph,
            "Ersatz Scrivener, scholar extraordinaire",
        ),
    ]

    for source, paragraph_type, inner_text in cases:
        paragraph: SpanContainer = parse_paragraph(source)
        # Wrap the paragraph in a list so the span-list helpers can check it
        assert_types([paragraph], [[paragraph_type, TextSpan]])
        assert_text([paragraph], [[inner_text]])
|
|
|
|
|
|
def test_parse_article():
    """Parse a three-paragraph article and check its complete span tree"""
    source: str = (
        "Writing a **unit test** requires having test //content//.\n\n"
        "This content, of course, must be [[created|Writing test collateral]].\n\n"
        "~Bucky\\\\\nUnit test writer"
    )
    parsed: ParsedArticle = parse_raw_markdown(source)

    # Expected type tree, one entry per paragraph
    formatted_types = [
        BodyParagraph,
        TextSpan,
        [BoldSpan, TextSpan],
        TextSpan,
        [ItalicSpan, TextSpan],
        TextSpan,
    ]
    citing_types = [BodyParagraph, TextSpan, [CitationSpan, TextSpan], TextSpan]
    signature_types = [SignatureParagraph, TextSpan, LineBreak, TextSpan]
    assert_types(
        [parsed],
        [[ParsedArticle, formatted_types, citing_types, signature_types]],
    )

    # Expected text tree, one entry per paragraph; None marks the line break
    formatted_text = [
        "Writing a ",
        ["unit test"],
        " requires having test ",
        ["content"],
        ".",
    ]
    citing_text = ["This content, of course, must be ", ["created"], "."]
    signature_text = ["Bucky", None, "Unit test writer"]
    assert_text([parsed], [[formatted_text, citing_text, signature_text]])
|
|
|
|
|
|
def test_visitor():
    """Test that a visitor dispatches to hooks correctly"""

    class TestVisitor(RenderableVisitor):
        """
        Visitor that records every span it is handed, in visit order.

        Each hook is named after the span type it handles and asserts that it
        received that type. Hooks for container spans also recurse into the
        container's children.
        """

        def __init__(self):
            # Spans in the order the hooks were invoked
            self.visited = []

        def TextSpan(self, span: TextSpan):
            assert isinstance(span, TextSpan)
            self.visited.append(span)

        def LineBreak(self, span: LineBreak):
            assert isinstance(span, LineBreak)
            self.visited.append(span)

        def ParsedArticle(self, span: ParsedArticle):
            assert isinstance(span, ParsedArticle)
            self.visited.append(span)
            span.recurse(self)

        def BodyParagraph(self, span: BodyParagraph):
            assert isinstance(span, BodyParagraph)
            self.visited.append(span)
            span.recurse(self)

        def SignatureParagraph(self, span: SignatureParagraph):
            assert isinstance(span, SignatureParagraph)
            self.visited.append(span)
            span.recurse(self)

        def BoldSpan(self, span: BoldSpan):
            assert isinstance(span, BoldSpan)
            self.visited.append(span)
            span.recurse(self)

        def ItalicSpan(self, span: ItalicSpan):
            assert isinstance(span, ItalicSpan)
            self.visited.append(span)
            span.recurse(self)

        def CitationSpan(self, span: CitationSpan):
            assert isinstance(span, CitationSpan)
            self.visited.append(span)
            span.recurse(self)

    article: str = (
        "Writing a **unit test** requires having test //content//.\n\n"
        "This content, of course, must be [[created|Writing test collateral]].\n\n"
        "~Bucky\\\\\nUnit test writer"
    )
    parsed: ParsedArticle = parse_raw_markdown(article)

    visitor = TestVisitor()
    # All the typecheck asserts inside the hooks must pass during rendering
    parsed.render(visitor)

    # The test article should parse into these spans. Each hook records its
    # span before recursing, so a container precedes its children.
    type_order = [
        ParsedArticle,
        BodyParagraph,
        TextSpan,
        BoldSpan,
        TextSpan,
        TextSpan,
        ItalicSpan,
        TextSpan,
        TextSpan,
        BodyParagraph,
        TextSpan,
        CitationSpan,
        TextSpan,
        TextSpan,
        SignatureParagraph,
        TextSpan,
        LineBreak,
        TextSpan,
    ]
    # Compare lengths first: zip() would stop at the shorter sequence
    assert len(visitor.visited) == len(type_order)
    for span, expected_type in zip(visitor.visited, type_order):
        assert isinstance(span, expected_type)
|