From 142ea1a9ba59007ac52cb89d99afe9c12188229c Mon Sep 17 00:00:00 2001
From: Tim Van Baak <tim.vanbaak@gmail.com>
Date: Wed, 9 Jun 2021 18:20:23 -0700
Subject: [PATCH] Add unit tests for line breaks and simple pairs

---
 amanuensis/parser/core.py    |  16 ++--
 amanuensis/parser/parsing.py |  76 +++++++++------
 tests/test_parser.py         | 176 +++++++++++++++++++++++++++++++++++
 3 files changed, 233 insertions(+), 35 deletions(-)
 create mode 100644 tests/test_parser.py
diff --git a/amanuensis/parser/core.py b/amanuensis/parser/core.py
index d50049a..cd1b6a1 100644
--- a/amanuensis/parser/core.py
+++ b/amanuensis/parser/core.py
@@ -38,14 +38,14 @@ class TextSpan(Renderable):
     def __init__(self, innertext: str):
         self.innertext = innertext
 
-    def __str__(self):
-        return f"[{self.innertext}]"
+    def __repr__(self):
+        return f"<{self.innertext}>"
 
 
 class LineBreak(Renderable):
     """A line break within a paragraph."""
 
-    def __str__(self):
+    def __repr__(self):
         return "<break>"
 
 
@@ -55,10 +55,10 @@ class SpanContainer(Renderable):
     def __init__(self, spans: Spans):
         self.spans: Spans = spans
 
-    def __str__(self):
+    def __repr__(self):
         return (
-            f"[{type(self).__name__} "
-            + f'{" ".join([str(span) for span in self.spans])}]'
+            f"<{type(self).__name__} "
+            + f'{" ".join([repr(span) for span in self.spans])}>'
         )
 
     def recurse(self, renderer: "RenderableVisitor"):
@@ -94,9 +94,9 @@ class CitationSpan(SpanContainer):
         # abnormal title strings lying around causing trouble.
         self.cite_target: str = normalize_title(cite_target)
 
-    def __str__(self) -> str:
+    def __repr__(self) -> str:
         return (
-            f'{{{" ".join([str(span) for span in self.spans])}'
+            f'{{{" ".join([repr(span) for span in self.spans])}'
             + f":{self.cite_target}}}"
         )
 
diff --git a/amanuensis/parser/parsing.py b/amanuensis/parser/parsing.py
index c6bb50b..a16afae 100644
--- a/amanuensis/parser/parsing.py
+++ b/amanuensis/parser/parsing.py
@@ -34,6 +34,9 @@ def parse_raw_markdown(text: str) -> ParsedArticle:
 
 
 def parse_paragraph(text: str) -> SpanContainer:
+    """
+    Parses a block of text into a paragraph object.
+    """
     # Parse the paragraph as a span of text
     text = text.strip()
     if text and text[0] == "~":
@@ -44,19 +47,28 @@ def parse_paragraph(text: str) -> SpanContainer:
 
 def parse_paired_formatting(
     text: str,
-    cite: bool = True,
-    bold: bool = True,
-    italic: bool = True,
+    can_cite: bool = True,
+    can_bold: bool = True,
+    can_italic: bool = True,
 ) -> Spans:
+    """
+    Parses citations, bolds, and italics, which can be nested inside each other.
+    """
     # Find positions of any paired formatting
-    first_cite = find_pair(text, "[[", "]]", cite)
-    first_bold = find_pair(text, "**", "**", bold)
-    first_italic = find_pair(text, "//", "//", italic)
+    first_cite = find_pair(text, "[[", "]]") if can_cite else -1
+    first_bold = find_pair(text, "**", "**") if can_bold else -1
+    first_italic = find_pair(text, "//", "//") if can_italic else -1
     # Load the possible parse handlers into the map
     handlers = {}
-    handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic)
-    handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic)
-    handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold)
+    handlers[first_cite] = lambda: parse_citation(
+        text, can_bold=can_bold, can_italic=can_italic
+    )
+    handlers[first_bold] = lambda: parse_bold(
+        text, can_cite=can_cite, can_italic=can_italic
+    )
+    handlers[first_italic] = lambda: parse_italic(
+        text, can_cite=can_cite, can_bold=can_bold
+    )
     # If nothing was found, move on to the next parsing step
     handlers[-1] = lambda: parse_breaks(text)
     # Choose a handler based on the earliest found result
@@ -65,15 +77,10 @@ def parse_paired_formatting(
     return handlers[first]()
 
 
-def find_pair(
-    text: str,
-    open_tag: str,
-    close_tag: str,
-    valid: bool,
-) -> int:
-    # If skipping, return -1
-    if not valid:
-        return -1
+def find_pair(text: str, open_tag: str, close_tag: str) -> int:
+    """
+    Finds the beginning of a pair of formatting marks.
+    """
     # If the open tag wasn't found, return -1
     first = text.find(open_tag)
     if first < 0:
@@ -88,9 +95,12 @@ def find_pair(
 
 def parse_citation(
     text: str,
-    bold: bool = True,
-    italic: bool = True,
+    can_bold: bool = True,
+    can_italic: bool = True,
 ) -> Spans:
+    """
+    Parses text into a citation span.
+    """
     cite_open = text.find("[[")
     if cite_open > -1:
         cite_close = text.find("]]", cite_open + 2)
@@ -108,7 +118,7 @@ def parse_citation(
         inner_split = text_inner.split("|", 1)
         text_inner_actual, cite_target = inner_split[0], inner_split[-1]
         spans_inner = parse_paired_formatting(
-            text_inner_actual, cite=False, bold=bold, italic=italic
+            text_inner_actual, can_cite=False, can_bold=can_bold, can_italic=can_italic
         )
         citation = CitationSpan(spans_inner, cite_target)
         return [*spans_before, citation, *spans_after]
@@ -118,9 +128,12 @@ def parse_citation(
 
 def parse_bold(
     text: str,
-    cite: bool = True,
-    italic: bool = True,
+    can_cite: bool = True,
+    can_italic: bool = True,
 ) -> Spans:
+    """
+    Parses text into a bold span.
+    """
     bold_open = text.find("**")
     if bold_open > -1:
         bold_close = text.find("**", bold_open + 2)
@@ -131,7 +144,7 @@ def parse_bold(
         # Parse inner text minus bold parsing
         text_inner = text[bold_open + 2 : bold_close]
         spans_inner = parse_paired_formatting(
-            text_inner, cite=cite, bold=False, italic=italic
+            text_inner, can_cite=can_cite, can_bold=False, can_italic=can_italic
         )
         bold = BoldSpan(spans_inner)
         return [*spans_before, bold, *spans_after]
@@ -141,9 +154,12 @@ def parse_bold(
 
 def parse_italic(
     text: str,
-    cite: bool = True,
-    bold: bool = True,
+    can_cite: bool = True,
+    can_bold: bool = True,
 ) -> Spans:
+    """
+    Parses text into an italic span.
+    """
     italic_open = text.find("//")
     if italic_open > -1:
         italic_close = text.find("//", italic_open + 2)
@@ -154,7 +170,7 @@ def parse_italic(
         # Parse inner text minus italic parsing
         text_inner = text[italic_open + 2 : italic_close]
         spans_inner = parse_paired_formatting(
-            text_inner, cite=cite, bold=bold, italic=False
+            text_inner, can_cite=can_cite, can_bold=can_bold, can_italic=False
         )
         italic = ItalicSpan(spans_inner)
         return [*spans_before, italic, *spans_after]
@@ -163,9 +179,15 @@ def parse_italic(
 
 
 def parse_breaks(text: str) -> Spans:
+    """
+    Parses intra-paragraph line breaks.
+    """
+    # Parse empty text into nothing
     if not text:
         return []
+    # Split on the line break mark appearing at the end of the line
     splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
+    # Put a LineBreak between each TextSpan
     spans: Spans = [
         splits[i // 2] if i % 2 == 0 else LineBreak()
         for i in range(0, 2 * len(splits) - 1)
diff --git a/tests/test_parser.py b/tests/test_parser.py
new file mode 100644
index 0000000..5a27765
--- /dev/null
+++ b/tests/test_parser.py
@@ -0,0 +1,176 @@
+from typing import Sequence
+
+from amanuensis.parser.core import (
+    TextSpan,
+    LineBreak,
+    ParsedArticle,
+    BodyParagraph,
+    SignatureParagraph,
+    BoldSpan,
+    ItalicSpan,
+    CitationSpan,
+    Renderable,
+    SpanContainer,
+    RenderableVisitor,
+    Spans,
+)
+from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort
+from amanuensis.parser.parsing import (
+    parse_breaks,
+    parse_paired_formatting,
+    parse_paragraph,
+    parse_raw_markdown,
+)
+
+
+def assert_types(spans: Spans, types: Sequence, loc=None):
+    """
+    Asserts that a span list has the types specified.
+    Each element in `types` should be either a span type or a list. The first
+    element of the list is the container type and the remaining elements are the
+    content types.
+    """
+    assert len(spans) == len(
+        types
+    ), f"Unexpected type sequence length at loc {loc if loc else 'root'}"
+    i = -1
+    for span, span_type in zip(spans, types):
+        i += 1
+        i_loc = f"{loc}.{i}" if loc else f"{i}"
+        if isinstance(span_type, list):
+            assert isinstance(
+                span, SpanContainer
+            ), f"Expected a span container at loc {i_loc}"
+            assert (
+                len(span.spans) == len(span_type) - 1
+            ), f"Unexpected container size at loc {i_loc}"
+            assert isinstance(
+                span, span_type[0]
+            ), f"Unexpected container type at loc {i_loc}"
+            assert_types(span.spans, span_type[1:], loc=i_loc)
+        else:
+            assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}"
+            assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}"
+
+
+def assert_text(spans: Spans, texts: Sequence, loc=None):
+    """
+    Asserts that a span list has the inner text structure specified.
+    Each element in `texts` is a string, a nested list for a span container, or
+    any other value (e.g. None) to mark where a LineBreak is expected.
+    """
+    assert len(spans) == len(
+        texts
+    ), f"Unexpected text sequence length at loc {loc if loc else 'root'}"
+    i = -1
+    for span, text in zip(spans, texts):
+        i += 1
+        i_loc = f"{loc}.{i}" if loc else f"{i}"
+        if isinstance(text, str):
+            assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}"
+            assert span.innertext == text, f"Unexpected text at loc {i_loc}"
+        elif isinstance(text, list):
+            assert isinstance(
+                span, SpanContainer
+            ), f"Expected a span container at loc {i_loc}"
+            assert_text(span.spans, text, loc=i_loc)
+        else:
+            assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}"
+
+
+def test_parse_breaks():
+    """Test parsing for intra-paragraph line breaks"""
+    text: str
+    spans: Spans
+
+    # A newline without the break mark does not split the text
+    text = "One\nTwo"
+    spans: Spans = parse_breaks(text)
+    assert_types(spans, [TextSpan])
+    assert_text(spans, [text])
+
+    # Having the mark causes the text to be split across it
+    text = r"One\\" + "\nTwo"
+    spans: Spans = parse_breaks(text)
+    assert_types(spans, [TextSpan, LineBreak, TextSpan])
+    assert_text(spans, ["One", None, "Two"])
+
+    # Multiple lines can be broken
+    text = r"One\\" + "\n" + r"Two\\" + "\nThree"
+    spans: Spans = parse_breaks(text)
+    assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan])
+    assert_text(spans, ["One", None, "Two", None, "Three"])
+
+    # The mark must be at the end of the line
+    text = r"One\\ " + "\nTwo"
+    spans: Spans = parse_breaks(text)
+    assert_types(spans, (TextSpan,))
+    assert_text(spans, [text])
+
+
+def test_simple_single_parse_pairs():
+    """Test parsing for bold and italic marks"""
+    text: str
+    spans: Spans
+
+    # Empty pair marks should parse into a container with no inner spans
+    text = "****"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[BoldSpan]])
+
+    text = "////"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[ItalicSpan]])
+
+    # Pair marks with text inside should parse
+    text = "**hello**"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[BoldSpan, TextSpan]])
+    assert_text(spans, [["hello"]])
+
+    text = "//hello//"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[ItalicSpan, TextSpan]])
+    assert_text(spans, [["hello"]])
+
+    # Text outside of pair marks should parse on the same level
+    text = "**hello** world"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
+    assert_text(spans, [["hello"], " world"])
+
+    text = "//hello// world"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[ItalicSpan, TextSpan], TextSpan])
+    assert_text(spans, [["hello"], " world"])
+
+    # Text before, between, and after pair marks should parse
+    text = "In the **beginning** was //the// Word"
+    spans = parse_paired_formatting(text)
+    assert_types(
+        spans,
+        [TextSpan, [BoldSpan, TextSpan], TextSpan, [ItalicSpan, TextSpan], TextSpan],
+    )
+    assert_text(spans, ["In the ", ["beginning"], " was ", ["the"], " Word"])
+
+
+def test_simple_nested_parse_pairs():
+    """Test parsing for nesting bold and italic"""
+    text: str
+    spans: Spans
+
+    # Simple nested test cases
+    text = "**//hello//**"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[BoldSpan, [ItalicSpan, TextSpan]]])
+    assert_text(spans, [[["hello"]]])
+
+    text = "//**world**//"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[ItalicSpan, [BoldSpan, TextSpan]]])
+    assert_text(spans, [[["world"]]])
+
+    # Overlapping marks: only the first pair parses; the rest stays plain text
+    text = "**Hello//world**//"
+    spans = parse_paired_formatting(text)
+    assert_types(spans, [[BoldSpan, TextSpan], TextSpan])
+    assert_text(spans, [["Hello//world"], "//"])