# amanuensis/tests/test_parser.py

from typing import Sequence

from amanuensis.parser.core import (
    TextSpan,
    LineBreak,
    ParsedArticle,
    BodyParagraph,
    SignatureParagraph,
    BoldSpan,
    ItalicSpan,
    CitationSpan,
    Renderable,
    SpanContainer,
    RenderableVisitor,
    Spans,
)
from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort
from amanuensis.parser.parsing import (
    parse_breaks,
    parse_paired_formatting,
    parse_paragraph,
    parse_raw_markdown,
)


def assert_types(spans: Spans, types: Sequence, loc=None):
    """
    Verify that a span list matches an expected type layout.

    Every entry of `types` is either a span type, matched against the span at
    the same index, or a list describing a container: its first element is the
    container type and its remaining elements describe the container's child
    spans, which are checked recursively. `loc` is the dotted index path of the
    enclosing container and is only used to build failure messages.
    """
    span_count = len(spans)
    type_count = len(types)
    # Walk the longer of the two sequences so that a length mismatch fails at
    # the first index present in only one of them
    for index in range(max(span_count, type_count)):
        here = f"{loc}.{index}" if loc else f"{index}"
        assert index < span_count, f"Span list unexpectedly short at {here}"
        assert index < type_count, f"Type list unexpectedly short at {here}"
        actual, expected = spans[index], types[index]
        if not isinstance(expected, list):
            # Plain entry: a single span of the given type
            assert isinstance(actual, Renderable), f"Expected a span at loc {here}"
            assert isinstance(actual, expected), f"Unexpected span type at loc {here}"
            continue
        # List entry: the container type followed by its child entries
        assert isinstance(
            actual, SpanContainer
        ), f"Expected a span container at loc {here}"
        child_count = len(expected) - 1
        assert (
            len(actual.spans) == child_count
        ), f"Unexpected container size at loc {here}"
        assert isinstance(
            actual, expected[0]
        ), f"Unexpected container type at loc {here}"
        assert_types(actual.spans, expected[1:], loc=here)


def assert_text(spans: Spans, texts: Sequence, loc=None):
    """
    Verify that a span list carries the expected inner text.

    Every entry of `texts` is matched against the span at the same index: a
    string requires a text span with exactly that inner text, a list requires a
    span container whose children match the list recursively, and any other
    value (the tests use None) requires a line break. `loc` is the dotted index
    path of the enclosing container and is only used in failure messages.
    """
    where = loc if loc else "root"
    assert len(spans) == len(
        texts
    ), f"Unexpected text sequence length at loc {where}"
    for index, (span, text) in enumerate(zip(spans, texts)):
        here = f"{loc}.{index}" if loc else f"{index}"
        if isinstance(text, str):
            assert isinstance(span, TextSpan), f"Expected a text span at loc {here}"
            assert span.innertext == text, f"Unexpected text at loc {here}"
            continue
        if isinstance(text, list):
            assert isinstance(
                span, SpanContainer
            ), f"Expected a span container at loc {here}"
            assert_text(span.spans, text, loc=here)
            continue
        # Anything that is neither a string nor a list stands for a line break
        assert isinstance(span, LineBreak), f"Expected a line break at loc {here}"

def test_parse_breaks():
    """Test parsing for intra-paragraph line breaks"""
    mark = r"\\"  # the break mark: two literal backslashes

    # Each case is (input text, expected span types, expected inner text)
    cases = [
        # Only having a line break does nothing
        ("One\nTwo", [TextSpan], ["One\nTwo"]),
        # Having the mark causes the text to be split across it
        (
            f"One{mark}\nTwo",
            [TextSpan, LineBreak, TextSpan],
            ["One", None, "Two"],
        ),
        # Multiple lines can be broken
        (
            f"One{mark}\nTwo{mark}\nThree",
            [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan],
            ["One", None, "Two", None, "Three"],
        ),
        # The mark must be at the end of the line
        (f"One{mark} \nTwo", (TextSpan,), [f"One{mark} \nTwo"]),
    ]

    for text, expected_types, expected_text in cases:
        spans = parse_breaks(text)
        assert_types(spans, expected_types)
        assert_text(spans, expected_text)


def test_parse_pairs_single():
    """Test parsing of bold and italic pair marks without nesting"""
    # Each case is (input text, expected span types, expected inner text);
    # an expected text of None means only the types are checked
    cases = [
        # Empty pair marks should parse
        ("****", [[BoldSpan]], None),
        ("////", [[ItalicSpan]], None),
        # Pair marks with text inside should parse
        ("**hello**", [[BoldSpan, TextSpan]], [["hello"]]),
        ("//hello//", [[ItalicSpan, TextSpan]], [["hello"]]),
        # Text outside of pair marks should parse on the same level
        ("**hello** world", [[BoldSpan, TextSpan], TextSpan], [["hello"], " world"]),
        ("//hello// world", [[ItalicSpan, TextSpan], TextSpan], [["hello"], " world"]),
        # Text before, between, and after pair marks should parse
        (
            "In the **beginning** was //the// Word",
            [
                TextSpan,
                [BoldSpan, TextSpan],
                TextSpan,
                [ItalicSpan, TextSpan],
                TextSpan,
            ],
            ["In the ", ["beginning"], " was ", ["the"], " Word"],
        ),
    ]

    for text, expected_types, expected_text in cases:
        spans = parse_paired_formatting(text)
        assert_types(spans, expected_types)
        if expected_text is not None:
            assert_text(spans, expected_text)


def test_parse_pairs_break():
    """Test pair marks whose content contains a break mark"""
    # Each case is (input text, expected span types, expected inner text)
    cases = [
        # The break mark and newline stay inside the paired span's text
        ("**glory\\\\\nhammer**", [[BoldSpan, TextSpan]], [["glory\\\\\nhammer"]]),
        ("//glory\\\\\nhammer//", [[ItalicSpan, TextSpan]], [["glory\\\\\nhammer"]]),
        # A pair mark after the newline closes the span; the rest is plain text
        (
            "**glory\\\\\n**hammer**",
            [[BoldSpan, TextSpan], TextSpan],
            [["glory\\\\\n"], "hammer**"],
        ),
        (
            "//glory\\\\\n//hammer//",
            [[ItalicSpan, TextSpan], TextSpan],
            [["glory\\\\\n"], "hammer//"],
        ),
    ]

    for text, expected_types, expected_text in cases:
        spans = parse_paired_formatting(text)
        assert_types(spans, expected_types)
        assert_text(spans, expected_text)


def test_parse_pairs_nested():
    """Test parsing of bold and italic marks nested in each other"""
    # Each case is (input text, expected span types, expected inner text)
    cases = [
        # Simple nested test cases
        ("**//hello//**", [[BoldSpan, [ItalicSpan, TextSpan]]], [[["hello"]]]),
        ("//**world**//", [[ItalicSpan, [BoldSpan, TextSpan]]], [[["world"]]]),
        # Overlap should only parse the first
        (
            "**Hello//world**//",
            [[BoldSpan, TextSpan], TextSpan],
            [["Hello//world"], "//"],
        ),
    ]

    for text, expected_types, expected_text in cases:
        spans = parse_paired_formatting(text)
        assert_types(spans, expected_types)
        assert_text(spans, expected_text)


def test_normalize_title():
    """Test the title normalization used by the citation parser"""
    # Each pair is (raw title, expected normalized title)
    expectations = [
        ("hello", "Hello"),
        ("  world  ", "World"),
        ("Waiting for           Godot", "Waiting for Godot"),
        ("lowercase letters", "Lowercase letters"),
    ]
    for raw, normalized in expectations:
        assert normalize_title(raw) == normalized


def test_parse_citation_single():
    """Test parsing citations, which have internal formatting"""
    plain_citation = [[CitationSpan, TextSpan]]

    # Each case is (input text, expected span types, expected inner text,
    # expected cite target of the first span)
    cases = [
        # Simple test cases
        ("[[hello]]", plain_citation, [["hello"]], "Hello"),
        ("[[hello|world]]", plain_citation, [["hello"]], "World"),
        ("[[hello||world]]", plain_citation, [["hello"]], "|world"),
        ("[[  hello  |  world  ]]", plain_citation, [["  hello  "]], "World"),
        ("[[faith|hope|love]]", plain_citation, [["faith"]], "Hope|love"),
        (
            "[[ [[|]] ]]",
            [[CitationSpan, TextSpan], TextSpan],
            [[" [["], " ]]"],
            "",
        ),
    ]

    for text, expected_types, expected_text, expected_target in cases:
        spans = parse_paired_formatting(text)
        assert_types(spans, expected_types)
        assert_text(spans, expected_text)
        citation = spans[0]
        assert citation.cite_target == expected_target


def test_parse_citation_break():
    """Test citations whose text or target contains a break mark"""
    # Each case is (input text, expected inner text, expected cite target);
    # every case parses to a single citation holding one text span
    cases = [
        ("[[hello\\\\\nworld]]", [["hello\\\\\nworld"]], "Hello\\\\ world"),
        ("[[one|two\\\\\nthree]]", [["one"]], "Two\\\\ three"),
    ]

    for text, expected_text, expected_target in cases:
        spans = parse_paired_formatting(text)
        assert_types(spans, [[CitationSpan, TextSpan]])
        assert_text(spans, expected_text)
        citation = spans[0]
        assert citation.cite_target == expected_target


def test_parse_citation_nested():
    """Test citations nested with bold marks"""

    def parse_and_check(text, expected_types, expected_text):
        # Parse the text, check its structure, and hand back the spans so the
        # caller can inspect the citation wherever it sits
        parsed = parse_paired_formatting(text)
        assert_types(parsed, expected_types)
        assert_text(parsed, expected_text)
        return parsed

    # Bold inside a citation
    spans = parse_and_check(
        "[[**hello world**]]",
        [[CitationSpan, [BoldSpan, TextSpan]]],
        [[["hello world"]]],
    )
    assert spans[0].cite_target == "**hello world**"

    # Bold marks split by the target separator
    spans = parse_and_check(
        "[[**hello|world**]]",
        [[CitationSpan, TextSpan]],
        [["**hello"]],
    )
    assert spans[0].cite_target == "World**"

    # Citation inside bold
    spans = parse_and_check(
        "**[[hello world]]**",
        [[BoldSpan, [CitationSpan, TextSpan]]],
        [[["hello world"]]],
    )
    assert spans[0].spans[0].cite_target == "Hello world"

    # Bold opened first and closed inside the citation marks
    parse_and_check(
        "**[[hello world**]]",
        [[BoldSpan, TextSpan], TextSpan],
        [["[[hello world"], "]]"],
    )

    # Citation opened first and closed inside the bold marks
    spans = parse_and_check(
        "[[**hello world]]**",
        [[CitationSpan, TextSpan], TextSpan],
        [["**hello world"], "**"],
    )
    assert spans[0].cite_target == "**hello world"


def test_parse_paragraphs():
    """Test parsing body and signature paragraphs"""
    # Each case is (raw paragraph, expected paragraph type, expected inner text)
    cases = [
        # Body paragraph
        (
            "\tIn the beginning was the Word.",
            BodyParagraph,
            "In the beginning was the Word.",
        ),
        # Signature paragraph
        (
            "~Ersatz Scrivener, scholar extraordinaire",
            SignatureParagraph,
            "Ersatz Scrivener, scholar extraordinaire",
        ),
    ]

    for para, paragraph_type, inner_text in cases:
        span = parse_paragraph(para)
        # Wrap the single paragraph in a list so the list helpers apply
        assert_types([span], [[paragraph_type, TextSpan]])
        assert_text([span], [[inner_text]])


def test_parse_article():
    """Test the full article parser on a three-paragraph article"""
    paragraphs = [
        "Writing a **unit test** requires having test //content//.",
        "This content, of course, must be [[created|Writing test collateral]].",
        "~Bucky\\\\\nUnit test writer",
    ]
    parsed = parse_raw_markdown("\n\n".join(paragraphs))

    # Expected type layout, one entry per paragraph
    formatting_types = [
        BodyParagraph,
        TextSpan,
        [BoldSpan, TextSpan],
        TextSpan,
        [ItalicSpan, TextSpan],
        TextSpan,
    ]
    citation_types = [BodyParagraph, TextSpan, [CitationSpan, TextSpan], TextSpan]
    signature_types = [SignatureParagraph, TextSpan, LineBreak, TextSpan]
    assert_types(
        [parsed],
        [[ParsedArticle, formatting_types, citation_types, signature_types]],
    )

    # Expected inner text, one entry per paragraph
    formatting_text = [
        "Writing a ",
        ["unit test"],
        " requires having test ",
        ["content"],
        ".",
    ]
    citation_text = ["This content, of course, must be ", ["created"], "."]
    signature_text = ["Bucky", None, "Unit test writer"]
    assert_text([parsed], [[formatting_text, citation_text, signature_text]])


def test_visitor():
    """Test that a visitor dispatches to hooks correctly"""

    class RecordingVisitor(RenderableVisitor):
        """
        Visitor that records every span it is handed, in visit order. Each
        hook checks that it received the span type it is named for; hooks for
        containers then recurse into the container.
        """

        def __init__(self):
            self.visited = []

        def TextSpan(self, span: TextSpan):
            assert isinstance(span, TextSpan)
            self.visited.append(span)

        def LineBreak(self, span: LineBreak):
            assert isinstance(span, LineBreak)
            self.visited.append(span)

        def ParsedArticle(self, span: ParsedArticle):
            assert isinstance(span, ParsedArticle)
            self.visited.append(span)
            span.recurse(self)

        def BodyParagraph(self, span: BodyParagraph):
            assert isinstance(span, BodyParagraph)
            self.visited.append(span)
            span.recurse(self)

        def SignatureParagraph(self, span: SignatureParagraph):
            assert isinstance(span, SignatureParagraph)
            self.visited.append(span)
            span.recurse(self)

        def BoldSpan(self, span: BoldSpan):
            assert isinstance(span, BoldSpan)
            self.visited.append(span)
            span.recurse(self)

        def ItalicSpan(self, span: ItalicSpan):
            assert isinstance(span, ItalicSpan)
            self.visited.append(span)
            span.recurse(self)

        def CitationSpan(self, span: CitationSpan):
            assert isinstance(span, CitationSpan)
            self.visited.append(span)
            span.recurse(self)

    paragraphs = [
        "Writing a **unit test** requires having test //content//.",
        "This content, of course, must be [[created|Writing test collateral]].",
        "~Bucky\\\\\nUnit test writer",
    ]
    parsed = parse_raw_markdown("\n\n".join(paragraphs))

    recorder = RecordingVisitor()
    # Rendering runs every hook, so all the typecheck asserts must pass
    parsed.render(recorder)

    # The test article should parse into these spans and visit in this
    # (arbitrary) order
    first_paragraph = [
        BodyParagraph,
        TextSpan,
        BoldSpan,
        TextSpan,
        TextSpan,
        ItalicSpan,
        TextSpan,
        TextSpan,
    ]
    second_paragraph = [BodyParagraph, TextSpan, CitationSpan, TextSpan, TextSpan]
    signature = [SignatureParagraph, TextSpan, LineBreak, TextSpan]
    expected_order = [ParsedArticle] + first_paragraph + second_paragraph + signature

    assert len(recorder.visited) == len(expected_order)
    for seen, expected in zip(recorder.visited, expected_order):
        assert isinstance(seen, expected)