From 142ea1a9ba59007ac52cb89d99afe9c12188229c Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Wed, 9 Jun 2021 18:20:23 -0700 Subject: [PATCH] Add unit tests for line breaks and simple pairs --- amanuensis/parser/core.py | 16 ++-- amanuensis/parser/parsing.py | 76 +++++++++------ tests/test_parser.py | 176 +++++++++++++++++++++++++++++++++++ 3 files changed, 233 insertions(+), 35 deletions(-) create mode 100644 tests/test_parser.py diff --git a/amanuensis/parser/core.py b/amanuensis/parser/core.py index d50049a..cd1b6a1 100644 --- a/amanuensis/parser/core.py +++ b/amanuensis/parser/core.py @@ -38,14 +38,14 @@ class TextSpan(Renderable): def __init__(self, innertext: str): self.innertext = innertext - def __str__(self): - return f"[{self.innertext}]" + def __repr__(self): + return f"<{self.innertext}>" class LineBreak(Renderable): """A line break within a paragraph.""" - def __str__(self): + def __repr__(self): return "" @@ -55,10 +55,10 @@ class SpanContainer(Renderable): def __init__(self, spans: Spans): self.spans: Spans = spans - def __str__(self): + def __repr__(self): return ( - f"[{type(self).__name__} " - + f'{" ".join([str(span) for span in self.spans])}]' + f"<{type(self).__name__} " + + f'{" ".join([repr(span) for span in self.spans])}>' ) def recurse(self, renderer: "RenderableVisitor"): @@ -94,9 +94,9 @@ class CitationSpan(SpanContainer): # abnormal title strings lying around causing trouble. 
self.cite_target: str = normalize_title(cite_target) - def __str__(self) -> str: + def __repr__(self) -> str: return ( - f'{{{" ".join([str(span) for span in self.spans])}' + f'{{{" ".join([repr(span) for span in self.spans])}' + f":{self.cite_target}}}" ) diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py index c6bb50b..a16afae 100644 --- a/amanuensis/parser/parsing.py +++ b/amanuensis/parser/parsing.py @@ -34,6 +34,9 @@ def parse_raw_markdown(text: str) -> ParsedArticle: def parse_paragraph(text: str) -> SpanContainer: + """ + Parses a block of text into a paragraph object. + """ # Parse the paragraph as a span of text text = text.strip() if text and text[0] == "~": @@ -44,19 +47,28 @@ def parse_paragraph(text: str) -> SpanContainer: def parse_paired_formatting( text: str, - cite: bool = True, - bold: bool = True, - italic: bool = True, + can_cite: bool = True, + can_bold: bool = True, + can_italic: bool = True, ) -> Spans: + """ + Parses citations, bolds, and italics, which can be nested inside each other. 
+ """ # Find positions of any paired formatting - first_cite = find_pair(text, "[[", "]]", cite) - first_bold = find_pair(text, "**", "**", bold) - first_italic = find_pair(text, "//", "//", italic) + first_cite = find_pair(text, "[[", "]]") if can_cite else -1 + first_bold = find_pair(text, "**", "**") if can_bold else -1 + first_italic = find_pair(text, "//", "//") if can_italic else -1 # Load the possible parse handlers into the map handlers = {} - handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) - handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) - handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) + handlers[first_cite] = lambda: parse_citation( + text, can_bold=can_bold, can_italic=can_italic + ) + handlers[first_bold] = lambda: parse_bold( + text, can_cite=can_cite, can_italic=can_italic + ) + handlers[first_italic] = lambda: parse_italic( + text, can_cite=can_cite, can_bold=can_bold + ) # If nothing was found, move on to the next parsing step handlers[-1] = lambda: parse_breaks(text) # Choose a handler based on the earliest found result @@ -65,15 +77,10 @@ def parse_paired_formatting( return handlers[first]() -def find_pair( - text: str, - open_tag: str, - close_tag: str, - valid: bool, -) -> int: - # If skipping, return -1 - if not valid: - return -1 +def find_pair(text: str, open_tag: str, close_tag: str) -> int: + """ + Finds the beginning of a pair of formatting marks. + """ # If the open tag wasn't found, return -1 first = text.find(open_tag) if first < 0: @@ -88,9 +95,12 @@ def find_pair( def parse_citation( text: str, - bold: bool = True, - italic: bool = True, + can_bold: bool = True, + can_italic: bool = True, ) -> Spans: + """ + Parses text into a citation span. 
+ """ cite_open = text.find("[[") if cite_open > -1: cite_close = text.find("]]", cite_open + 2) @@ -108,7 +118,7 @@ def parse_citation( inner_split = text_inner.split("|", 1) text_inner_actual, cite_target = inner_split[0], inner_split[-1] spans_inner = parse_paired_formatting( - text_inner_actual, cite=False, bold=bold, italic=italic + text_inner_actual, can_cite=False, can_bold=can_bold, can_italic=can_italic ) citation = CitationSpan(spans_inner, cite_target) return [*spans_before, citation, *spans_after] @@ -118,9 +128,12 @@ def parse_citation( def parse_bold( text: str, - cite: bool = True, - italic: bool = True, + can_cite: bool = True, + can_italic: bool = True, ) -> Spans: + """ + Parses text into a bold span. + """ bold_open = text.find("**") if bold_open > -1: bold_close = text.find("**", bold_open + 2) @@ -131,7 +144,7 @@ def parse_bold( # Parse inner text minus bold parsing text_inner = text[bold_open + 2 : bold_close] spans_inner = parse_paired_formatting( - text_inner, cite=cite, bold=False, italic=italic + text_inner, can_cite=can_cite, can_bold=False, can_italic=can_italic ) bold = BoldSpan(spans_inner) return [*spans_before, bold, *spans_after] @@ -141,9 +154,12 @@ def parse_bold( def parse_italic( text: str, - cite: bool = True, - bold: bool = True, + can_cite: bool = True, + can_bold: bool = True, ) -> Spans: + """ + Parses text into an italic span. + """ italic_open = text.find("//") if italic_open > -1: italic_close = text.find("//", italic_open + 2) @@ -154,7 +170,7 @@ def parse_italic( # Parse inner text minus italic parsing text_inner = text[italic_open + 2 : italic_close] spans_inner = parse_paired_formatting( - text_inner, cite=cite, bold=bold, italic=False + text_inner, can_cite=can_cite, can_bold=can_bold, can_italic=False ) italic = ItalicSpan(spans_inner) return [*spans_before, italic, *spans_after] @@ -163,9 +179,15 @@ def parse_italic( def parse_breaks(text: str) -> Spans: + """ + Parses intra-paragraph line breaks. 
+ """ + # Parse empty text into nothing if not text: return [] + # Split on the line break mark appearing at the end of the line splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) + # Put a LineBreak between each TextSpan spans: Spans = [ splits[i // 2] if i % 2 == 0 else LineBreak() for i in range(0, 2 * len(splits) - 1) diff --git a/tests/test_parser.py b/tests/test_parser.py new file mode 100644 index 0000000..5a27765 --- /dev/null +++ b/tests/test_parser.py @@ -0,0 +1,176 @@ +from typing import Sequence + +from amanuensis.parser.core import ( + TextSpan, + LineBreak, + ParsedArticle, + BodyParagraph, + SignatureParagraph, + BoldSpan, + ItalicSpan, + CitationSpan, + Renderable, + SpanContainer, + RenderableVisitor, + Spans, +) +from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort +from amanuensis.parser.parsing import ( + parse_breaks, + parse_paired_formatting, + parse_paragraph, + parse_raw_markdown, +) + + +def assert_types(spans: Spans, types: Sequence, loc=None): + """ + Asserts that a span list has the types specified. + Each element in `types` should be either a span type or a list. The first + element of the list is the container type and the remaining elements are the + content types. 
+ """ + assert len(spans) == len( + types + ), f"Unexpected type sequence length at loc {loc if loc else 'root'}" + i = -1 + for span, span_type in zip(spans, types): + i += 1 + i_loc = f"{loc}.{i}" if loc else f"{i}" + if isinstance(span_type, list): + assert isinstance( + span, SpanContainer + ), f"Expected a span container at loc {i_loc}" + assert ( + len(span.spans) == len(span_type) - 1 + ), f"Unexpected container size at loc {i_loc}" + assert isinstance( + span, span_type[0] + ), f"Unexpected container type at loc {i_loc}" + assert_types(span.spans, span_type[1:], loc=i_loc) + else: + assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}" + assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}" + + +def assert_text(spans: Spans, texts: Sequence, loc=None): + """ + Asserts that a span list has the inner text structure specified. + Each element in `texts` should be either a string or a list of the same. + """ + assert len(spans) == len( + texts + ), f"Unexpected text sequence length at loc {loc if loc else 'root'}" + i = -1 + for span, text in zip(spans, texts): + i += 1 + i_loc = f"{loc}.{i}" if loc else f"{i}" + if isinstance(text, str): + assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}" + assert span.innertext == text, f"Unexpected text at loc {i_loc}" + elif isinstance(text, list): + assert isinstance( + span, SpanContainer + ), f"Expected a span container at loc {i_loc}" + assert_text(span.spans, text, loc=i_loc) + else: + assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}" + + +def test_parse_breaks(): + """Test parsing for intra-paragraph line break""" + text: str + spans: Spans + + # Only having a line break does nothing + text = "One\nTwo" + spans: Spans = parse_breaks(text) + assert_types(spans, [TextSpan]) + assert_text(spans, [text]) + + # Having the mark causes the text to be split across it + text = r"One\\" + "\nTwo" + spans: Spans = parse_breaks(text) + 
assert_types(spans, [TextSpan, LineBreak, TextSpan]) + assert_text(spans, ["One", None, "Two"]) + + # Multiple lines can be broken + text = r"One\\" + "\n" + r"Two\\" + "\nThree" + spans: Spans = parse_breaks(text) + assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan]) + assert_text(spans, ["One", None, "Two", None, "Three"]) + + # The mark must be at the end of the line + text = r"One\\ " + "\nTwo" + spans: Spans = parse_breaks(text) + assert_types(spans, (TextSpan,)) + assert_text(spans, [text]) + + +def test_simple_single_parse_pairs(): + """Test parsing for bold and italic marks""" + text: str + spans: Spans + + # Empty pair marks should parse + text = "****" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan]]) + + text = "////" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan]]) + + # Pair marks with text inside should parse + text = "**hello**" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan]]) + assert_text(spans, [["hello"]]) + + text = "//hello//" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, TextSpan]]) + assert_text(spans, [["hello"]]) + + # Text outside of pair marks should parse on the same level + text = "**hello** world" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan], TextSpan]) + assert_text(spans, [["hello"], " world"]) + + text = "//hello// world" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, TextSpan], TextSpan]) + assert_text(spans, [["hello"], " world"]) + + # Text before, between, and after pair marks should parse + text = "In the **beginning** was //the// Word" + spans = parse_paired_formatting(text) + assert_types( + spans, + [TextSpan, [BoldSpan, TextSpan], TextSpan, [ItalicSpan, TextSpan], TextSpan], + ) + assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"]) + + +def test_simple_nested_parse_pairs(): + """Test parsing 
for nesting bold and italic""" + text: str + spans: Spans + + # Simple nested test cases + text = "**//hello//**" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, [ItalicSpan, TextSpan]]]) + assert_text(spans, [[["hello"]]]) + + text = "//**world**//" + spans = parse_paired_formatting(text) + assert_types(spans, [[ItalicSpan, [BoldSpan, TextSpan]]]) + assert_text(spans, [[["world"]]]) + + # Overlap should only parse the first + text = "**Hello//world**//" + spans = parse_paired_formatting(text) + assert_types(spans, [[BoldSpan, TextSpan], TextSpan]) + assert_text(spans, [["Hello//world"], "//"])