amanuensis/tests/test_parser.py

from typing import Sequence

from amanuensis.parser.core import (
    TextSpan,
    LineBreak,
    ParsedArticle,
    BodyParagraph,
    SignatureParagraph,
    BoldSpan,
    ItalicSpan,
    CitationSpan,
    Renderable,
    SpanContainer,
    RenderableVisitor,
    Spans,
)
from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort
from amanuensis.parser.parsing import (
    parse_breaks,
    parse_paired_formatting,
    parse_paragraph,
    parse_raw_markdown,
)


def assert_types(spans: Spans, types: Sequence, loc=None):
    """
    Check a span list against an expected layout of types.

    Each entry of `types` lines up with the span at the same position and is
    either a span type or a list. A list describes a container: its first item
    is the container type and the remaining items are the layout expected of
    the container's children, which is checked recursively. `loc` is the
    dotted index path of the enclosing container and only feeds the assertion
    messages; leave it unset for the outermost call.
    """
    assert len(spans) == len(
        types
    ), f"Unexpected type sequence length at loc {loc if loc else 'root'}"
    for i, (span, span_type) in enumerate(zip(spans, types)):
        i_loc = f"{loc}.{i}" if loc else f"{i}"
        if not isinstance(span_type, list):
            # Plain entry: the span itself must be of the expected type
            assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}"
            assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}"
            continue
        # List entry: check the container first, then descend into its children
        is_container = isinstance(span, SpanContainer)
        assert is_container, f"Expected a span container at loc {i_loc}"
        size_matches = len(span.spans) == len(span_type) - 1
        assert size_matches, f"Unexpected container size at loc {i_loc}"
        type_matches = isinstance(span, span_type[0])
        assert type_matches, f"Unexpected container type at loc {i_loc}"
        assert_types(span.spans, span_type[1:], loc=i_loc)


def assert_text(spans: Spans, texts: Sequence, loc=None):
    """
    Check a span list against an expected layout of inner text.

    Each entry of `texts` lines up with the span at the same position:
    a string expects a text span with exactly that inner text, a list expects
    a container whose children match the list recursively, and any other
    value (the tests in this file pass None) expects a line break. `loc` is
    the dotted index path of the enclosing container and only feeds the
    assertion messages; leave it unset for the outermost call.
    """
    assert len(spans) == len(
        texts
    ), f"Unexpected text sequence length at loc {loc if loc else 'root'}"
    for i, (span, text) in enumerate(zip(spans, texts)):
        i_loc = f"{loc}.{i}" if loc else f"{i}"
        if isinstance(text, list):
            # Nested expectation: descend into the container's children
            is_container = isinstance(span, SpanContainer)
            assert is_container, f"Expected a span container at loc {i_loc}"
            assert_text(span.spans, text, loc=i_loc)
        elif isinstance(text, str):
            assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}"
            assert span.innertext == text, f"Unexpected text at loc {i_loc}"
        else:
            # Anything that is neither a string nor a list stands for a break
            assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}"


def test_parse_breaks():
    """Test parsing for intra-paragraph line breaks"""
    # Declared once here; re-annotating on each assignment below would be
    # reported by type checkers as a redefinition of the same name.
    text: str
    spans: Spans

    # A bare newline, without the break mark, leaves the text in one span
    text = "One\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan])
    assert_text(spans, [text])

    # The mark (two backslashes) right before the newline splits the text
    # around a line break; neither the mark nor the newline is kept as text
    text = r"One\\" + "\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan, LineBreak, TextSpan])
    assert_text(spans, ["One", None, "Two"])

    # Multiple lines can be broken
    text = r"One\\" + "\n" + r"Two\\" + "\nThree"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan])
    assert_text(spans, ["One", None, "Two", None, "Three"])

    # The mark must be at the very end of the line: a trailing space after it
    # means no break is recognized and the text stays untouched
    text = r"One\\ " + "\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan])
    assert_text(spans, [text])


def test_simple_single_parse_pairs():
    """Check that single, non-nested bold and italic pairs are parsed"""
    # An opening mark directly followed by its closing mark still yields a
    # container, just one without children
    assert_types(parse_paired_formatting("****"), [[BoldSpan]])
    assert_types(parse_paired_formatting("////"), [[ItalicSpan]])

    # Text between the marks becomes the container's only child
    bold = parse_paired_formatting("**hello**")
    assert_types(bold, [[BoldSpan, TextSpan]])
    assert_text(bold, [["hello"]])

    italic = parse_paired_formatting("//hello//")
    assert_types(italic, [[ItalicSpan, TextSpan]])
    assert_text(italic, [["hello"]])

    # Text after the closing mark is a sibling of the container, not a child
    bold_then_plain = parse_paired_formatting("**hello** world")
    assert_types(bold_then_plain, [[BoldSpan, TextSpan], TextSpan])
    assert_text(bold_then_plain, [["hello"], " world"])

    italic_then_plain = parse_paired_formatting("//hello// world")
    assert_types(italic_then_plain, [[ItalicSpan, TextSpan], TextSpan])
    assert_text(italic_then_plain, [["hello"], " world"])

    # Plain text may surround and separate several pairs on the same level
    mixed = parse_paired_formatting("In the **beginning** was //the// Word")
    mixed_types = [
        TextSpan,
        [BoldSpan, TextSpan],
        TextSpan,
        [ItalicSpan, TextSpan],
        TextSpan,
    ]
    mixed_texts = ["In the ", ["beginning"], " was ", ["the"], " Word"]
    assert_types(mixed, mixed_types)
    assert_text(mixed, mixed_texts)


def test_simple_parse_pairs_with_break():
    """Check paired marks whose content contains a line break mark"""
    # A break mark and newline inside a pair stay in the pair's text as-is
    bold = parse_paired_formatting(r"**glory\\" + "\nhammer**")
    assert_types(bold, [[BoldSpan, TextSpan]])
    assert_text(bold, [["glory\\\\\nhammer"]])

    italic = parse_paired_formatting(r"//glory\\" + "\nhammer//")
    assert_types(italic, [[ItalicSpan, TextSpan]])
    assert_text(italic, [["glory\\\\\nhammer"]])

    # A closing mark right after the newline ends the pair there; what
    # follows, including the unmatched trailing mark, is plain text
    bold_closed_early = parse_paired_formatting(r"**glory\\" + "\n**hammer**")
    assert_types(bold_closed_early, [[BoldSpan, TextSpan], TextSpan])
    assert_text(bold_closed_early, [["glory\\\\\n"], "hammer**"])

    italic_closed_early = parse_paired_formatting(r"//glory\\" + "\n//hammer//")
    assert_types(italic_closed_early, [[ItalicSpan, TextSpan], TextSpan])
    assert_text(italic_closed_early, [["glory\\\\\n"], "hammer//"])


def test_simple_nested_parse_pairs():
    """Check bold and italic pairs placed inside one another"""
    # A pair fully enclosed by the other kind nests as a child container
    italic_in_bold = parse_paired_formatting("**//hello//**")
    assert_types(italic_in_bold, [[BoldSpan, [ItalicSpan, TextSpan]]])
    assert_text(italic_in_bold, [[["hello"]]])

    bold_in_italic = parse_paired_formatting("//**world**//")
    assert_types(bold_in_italic, [[ItalicSpan, [BoldSpan, TextSpan]]])
    assert_text(bold_in_italic, [[["world"]]])

    # When the pairs overlap instead of nesting, only the pair opened first
    # is parsed; the other pair's marks are left as literal text
    overlapping = parse_paired_formatting("**Hello//world**//")
    assert_types(overlapping, [[BoldSpan, TextSpan], TextSpan])
    assert_text(overlapping, [["Hello//world"], "//"])
Add unit tests for line breaks and simple pairs 2021-06-10 01:20:23 +00:00			`from typing import Sequence`

			`from amanuensis.parser.core import (`
			`TextSpan,`
			`LineBreak,`
			`ParsedArticle,`
			`BodyParagraph,`
			`SignatureParagraph,`
			`BoldSpan,`
			`ItalicSpan,`
			`CitationSpan,`
			`Renderable,`
			`SpanContainer,`
			`RenderableVisitor,`
			`Spans,`
			`)`
			`from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort`
			`from amanuensis.parser.parsing import (`
			`parse_breaks,`
			`parse_paired_formatting,`
			`parse_paragraph,`
			`parse_raw_markdown,`
			`)`


			`def assert_types(spans: Spans, types: Sequence, loc=None):`
			`"""`
			`Asserts that a span list has the types specified.`
			Each element in `types` should be either a span type or a list. The first
			`element of the list is the container type and the remaining elements are the`
			`content types.`
			`"""`
			`assert len(spans) == len(`
			`types`
			`), f"Unexpected type sequence length at loc {loc if loc else 'root'}"`
			`i = -1`
			`for span, span_type in zip(spans, types):`
			`i += 1`
			`i_loc = f"{loc}.{i}" if loc else f"{i}"`
			`if isinstance(span_type, list):`
			`assert isinstance(`
			`span, SpanContainer`
			`), f"Expected a span container at loc {i_loc}"`
			`assert (`
			`len(span.spans) == len(span_type) - 1`
			`), f"Unexpected container size at loc {i_loc}"`
			`assert isinstance(`
			`span, span_type[0]`
			`), f"Unexpected container type at loc {i_loc}"`
			`assert_types(span.spans, span_type[1:], loc=i_loc)`
			`else:`
			`assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}"`
			`assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}"`


			`def assert_text(spans: Spans, texts: Sequence, loc=None):`
			`"""`
			`Asserts that a span list has the inner text structure specified.`
			Each element in `texts` should be either a string or a list of the same.
			`"""`
			`assert len(spans) == len(`
			`texts`
			`), f"Unexpected text sequence length at loc {loc if loc else 'root'}"`
			`i = -1`
			`for span, text in zip(spans, texts):`
			`i += 1`
			`i_loc = f"{loc}.{i}" if loc else f"{i}"`
			`if isinstance(text, str):`
			`assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}"`
			`assert span.innertext == text, f"Unexpected text at loc {i_loc}"`
			`elif isinstance(text, list):`
			`assert isinstance(`
			`span, SpanContainer`
			`), f"Expected a span container at loc {i_loc}"`
			`assert_text(span.spans, text, loc=i_loc)`
			`else:`
			`assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}"`


			`def test_parse_breaks():`
			`"""Test parsing for intra-paragraph line break"""`
			`text: str`
			`spans: Spans`

			`# Only having a line break does nothing`
			`text = "One\nTwo"`
			`spans: Spans = parse_breaks(text)`
			`assert_types(spans, [TextSpan])`
			`assert_text(spans, [text])`

			`# Having the mark causes the text to be split across it`
			`text = r"One\\" + "\nTwo"`
			`spans: Spans = parse_breaks(text)`
			`assert_types(spans, [TextSpan, LineBreak, TextSpan])`
			`assert_text(spans, ["One", None, "Two"])`

			`# Multiple lines can be broken`
			`text = r"One\\" + "\n" + r"Two\\" + "\nThree"`
			`spans: Spans = parse_breaks(text)`
			`assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan])`
			`assert_text(spans, ["One", None, "Two", None, "Three"])`

			`# The mark must be at the end of the line`
			`text = r"One\\ " + "\nTwo"`
			`spans: Spans = parse_breaks(text)`
			`assert_types(spans, (TextSpan,))`
			`assert_text(spans, [text])`


			`def test_simple_single_parse_pairs():`
			`"""Test parsing for bold and italic marks"""`
			`text: str`
			`spans: Spans`

			`# Empty pair marks should parse`
			`text = "****"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[BoldSpan]])`

			`text = "////"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[ItalicSpan]])`

			`# Pair marks with text inside should parse`
			`text = "**hello**"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[BoldSpan, TextSpan]])`
			`assert_text(spans, [["hello"]])`

			`text = "//hello//"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[ItalicSpan, TextSpan]])`
			`assert_text(spans, [["hello"]])`

			`# Text outside of pair marks should parse on the same level`
			`text = "**hello** world"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[BoldSpan, TextSpan], TextSpan])`
			`assert_text(spans, [["hello"], " world"])`

			`text = "//hello// world"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[ItalicSpan, TextSpan], TextSpan])`
			`assert_text(spans, [["hello"], " world"])`

			`# Text before, between, and after pair marks should parse`
			`text = "In the **beginning** was //the// Word"`
			`spans = parse_paired_formatting(text)`
			`assert_types(`
			`spans,`
			`[TextSpan, [BoldSpan, TextSpan], TextSpan, [ItalicSpan, TextSpan], TextSpan],`
			`)`
			`assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"])`


Fix parsing pair marks with line breaks inside 2021-06-10 03:14:32 +00:00			`def test_simple_parse_pairs_with_break():`
			`"""Test pair marks with breaks"""`
			`text: str`
			`spans: Spans`

			`text = r"**glory\\" + "\nhammer**"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[BoldSpan, TextSpan]])`
			`assert_text(spans, [["glory\\\\\nhammer"]])`

			`text = r"//glory\\" + "\nhammer//"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[ItalicSpan, TextSpan]])`
			`assert_text(spans, [["glory\\\\\nhammer"]])`

			`text = r"**glory\\" + "\n**hammer**"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[BoldSpan, TextSpan], TextSpan])`
			`assert_text(spans, [["glory\\\\\n"], "hammer**"])`

			`text = r"//glory\\" + "\n//hammer//"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[ItalicSpan, TextSpan], TextSpan])`
			`assert_text(spans, [["glory\\\\\n"], "hammer//"])`


Add unit tests for line breaks and simple pairs 2021-06-10 01:20:23 +00:00			`def test_simple_nested_parse_pairs():`
			`"""Test parsing for nesting bold and italic"""`
			`text: str`
			`spans: Spans`

			`# Simple nested test cases`
			`text = "**//hello//**"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[BoldSpan, [ItalicSpan, TextSpan]]])`
			`assert_text(spans, [[["hello"]]])`

			`text = "//**world**//"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[ItalicSpan, [BoldSpan, TextSpan]]])`
			`assert_text(spans, [[["world"]]])`

			`# Overlap should only parse the first`
			`text = "**Hello//world**//"`
			`spans = parse_paired_formatting(text)`
			`assert_types(spans, [[BoldSpan, TextSpan], TextSpan])`
			`assert_text(spans, [["Hello//world"], "//"])`