Add unit tests for line breaks and simple pairs
This commit is contained in:
parent
1c55d866a8
commit
7a847e96d3
|
@ -38,14 +38,14 @@ class TextSpan(Renderable):
|
||||||
def __init__(self, innertext: str):
|
def __init__(self, innertext: str):
|
||||||
self.innertext = innertext
|
self.innertext = innertext
|
||||||
|
|
||||||
def __str__(self):
|
def __repr__(self):
|
||||||
return f"[{self.innertext}]"
|
return f"<{self.innertext}>"
|
||||||
|
|
||||||
|
|
||||||
class LineBreak(Renderable):
|
class LineBreak(Renderable):
|
||||||
"""A line break within a paragraph."""
|
"""A line break within a paragraph."""
|
||||||
|
|
||||||
def __str__(self):
|
def __repr__(self):
|
||||||
return "<break>"
|
return "<break>"
|
||||||
|
|
||||||
|
|
||||||
|
@ -55,10 +55,10 @@ class SpanContainer(Renderable):
|
||||||
def __init__(self, spans: Spans):
|
def __init__(self, spans: Spans):
|
||||||
self.spans: Spans = spans
|
self.spans: Spans = spans
|
||||||
|
|
||||||
def __str__(self):
|
def __repr__(self):
|
||||||
return (
|
return (
|
||||||
f"[{type(self).__name__} "
|
f"<{type(self).__name__} "
|
||||||
+ f'{" ".join([str(span) for span in self.spans])}]'
|
+ f'{" ".join([repr(span) for span in self.spans])}>'
|
||||||
)
|
)
|
||||||
|
|
||||||
def recurse(self, renderer: "RenderableVisitor"):
|
def recurse(self, renderer: "RenderableVisitor"):
|
||||||
|
@ -94,9 +94,9 @@ class CitationSpan(SpanContainer):
|
||||||
# abnormal title strings lying around causing trouble.
|
# abnormal title strings lying around causing trouble.
|
||||||
self.cite_target: str = normalize_title(cite_target)
|
self.cite_target: str = normalize_title(cite_target)
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return (
|
return (
|
||||||
f'{{{" ".join([str(span) for span in self.spans])}'
|
f'{{{" ".join([repr(span) for span in self.spans])}'
|
||||||
+ f":{self.cite_target}}}"
|
+ f":{self.cite_target}}}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -34,6 +34,9 @@ def parse_raw_markdown(text: str) -> ParsedArticle:
|
||||||
|
|
||||||
|
|
||||||
def parse_paragraph(text: str) -> SpanContainer:
|
def parse_paragraph(text: str) -> SpanContainer:
|
||||||
|
"""
|
||||||
|
Parses a block of text into a paragraph object.
|
||||||
|
"""
|
||||||
# Parse the paragraph as a span of text
|
# Parse the paragraph as a span of text
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
if text and text[0] == "~":
|
if text and text[0] == "~":
|
||||||
|
@ -44,19 +47,28 @@ def parse_paragraph(text: str) -> SpanContainer:
|
||||||
|
|
||||||
def parse_paired_formatting(
|
def parse_paired_formatting(
|
||||||
text: str,
|
text: str,
|
||||||
cite: bool = True,
|
can_cite: bool = True,
|
||||||
bold: bool = True,
|
can_bold: bool = True,
|
||||||
italic: bool = True,
|
can_italic: bool = True,
|
||||||
) -> Spans:
|
) -> Spans:
|
||||||
|
"""
|
||||||
|
Parses citations, bolds, and italics, which can be nested inside each other.
|
||||||
|
"""
|
||||||
# Find positions of any paired formatting
|
# Find positions of any paired formatting
|
||||||
first_cite = find_pair(text, "[[", "]]", cite)
|
first_cite = find_pair(text, "[[", "]]") if can_cite else -1
|
||||||
first_bold = find_pair(text, "**", "**", bold)
|
first_bold = find_pair(text, "**", "**") if can_bold else -1
|
||||||
first_italic = find_pair(text, "//", "//", italic)
|
first_italic = find_pair(text, "//", "//") if can_italic else -1
|
||||||
# Load the possible parse handlers into the map
|
# Load the possible parse handlers into the map
|
||||||
handlers = {}
|
handlers = {}
|
||||||
handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic)
|
handlers[first_cite] = lambda: parse_citation(
|
||||||
handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic)
|
text, can_bold=can_bold, can_italic=can_italic
|
||||||
handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold)
|
)
|
||||||
|
handlers[first_bold] = lambda: parse_bold(
|
||||||
|
text, can_cite=can_cite, can_italic=can_italic
|
||||||
|
)
|
||||||
|
handlers[first_italic] = lambda: parse_italic(
|
||||||
|
text, can_cite=can_cite, can_bold=can_bold
|
||||||
|
)
|
||||||
# If nothing was found, move on to the next parsing step
|
# If nothing was found, move on to the next parsing step
|
||||||
handlers[-1] = lambda: parse_breaks(text)
|
handlers[-1] = lambda: parse_breaks(text)
|
||||||
# Choose a handler based on the earliest found result
|
# Choose a handler based on the earliest found result
|
||||||
|
@ -65,15 +77,10 @@ def parse_paired_formatting(
|
||||||
return handlers[first]()
|
return handlers[first]()
|
||||||
|
|
||||||
|
|
||||||
def find_pair(
|
def find_pair(text: str, open_tag: str, close_tag: str) -> int:
|
||||||
text: str,
|
"""
|
||||||
open_tag: str,
|
Finds the beginning of a pair of formatting marks.
|
||||||
close_tag: str,
|
"""
|
||||||
valid: bool,
|
|
||||||
) -> int:
|
|
||||||
# If skipping, return -1
|
|
||||||
if not valid:
|
|
||||||
return -1
|
|
||||||
# If the open tag wasn't found, return -1
|
# If the open tag wasn't found, return -1
|
||||||
first = text.find(open_tag)
|
first = text.find(open_tag)
|
||||||
if first < 0:
|
if first < 0:
|
||||||
|
@ -88,9 +95,12 @@ def find_pair(
|
||||||
|
|
||||||
def parse_citation(
|
def parse_citation(
|
||||||
text: str,
|
text: str,
|
||||||
bold: bool = True,
|
can_bold: bool = True,
|
||||||
italic: bool = True,
|
can_italic: bool = True,
|
||||||
) -> Spans:
|
) -> Spans:
|
||||||
|
"""
|
||||||
|
Parses text into a citation span.
|
||||||
|
"""
|
||||||
cite_open = text.find("[[")
|
cite_open = text.find("[[")
|
||||||
if cite_open > -1:
|
if cite_open > -1:
|
||||||
cite_close = text.find("]]", cite_open + 2)
|
cite_close = text.find("]]", cite_open + 2)
|
||||||
|
@ -108,7 +118,7 @@ def parse_citation(
|
||||||
inner_split = text_inner.split("|", 1)
|
inner_split = text_inner.split("|", 1)
|
||||||
text_inner_actual, cite_target = inner_split[0], inner_split[-1]
|
text_inner_actual, cite_target = inner_split[0], inner_split[-1]
|
||||||
spans_inner = parse_paired_formatting(
|
spans_inner = parse_paired_formatting(
|
||||||
text_inner_actual, cite=False, bold=bold, italic=italic
|
text_inner_actual, can_cite=False, can_bold=can_bold, can_italic=can_italic
|
||||||
)
|
)
|
||||||
citation = CitationSpan(spans_inner, cite_target)
|
citation = CitationSpan(spans_inner, cite_target)
|
||||||
return [*spans_before, citation, *spans_after]
|
return [*spans_before, citation, *spans_after]
|
||||||
|
@ -118,9 +128,12 @@ def parse_citation(
|
||||||
|
|
||||||
def parse_bold(
|
def parse_bold(
|
||||||
text: str,
|
text: str,
|
||||||
cite: bool = True,
|
can_cite: bool = True,
|
||||||
italic: bool = True,
|
can_italic: bool = True,
|
||||||
) -> Spans:
|
) -> Spans:
|
||||||
|
"""
|
||||||
|
Parses text into a bold span.
|
||||||
|
"""
|
||||||
bold_open = text.find("**")
|
bold_open = text.find("**")
|
||||||
if bold_open > -1:
|
if bold_open > -1:
|
||||||
bold_close = text.find("**", bold_open + 2)
|
bold_close = text.find("**", bold_open + 2)
|
||||||
|
@ -131,7 +144,7 @@ def parse_bold(
|
||||||
# Parse inner text minus bold parsing
|
# Parse inner text minus bold parsing
|
||||||
text_inner = text[bold_open + 2 : bold_close]
|
text_inner = text[bold_open + 2 : bold_close]
|
||||||
spans_inner = parse_paired_formatting(
|
spans_inner = parse_paired_formatting(
|
||||||
text_inner, cite=cite, bold=False, italic=italic
|
text_inner, can_cite=can_cite, can_bold=False, can_italic=can_italic
|
||||||
)
|
)
|
||||||
bold = BoldSpan(spans_inner)
|
bold = BoldSpan(spans_inner)
|
||||||
return [*spans_before, bold, *spans_after]
|
return [*spans_before, bold, *spans_after]
|
||||||
|
@ -141,9 +154,12 @@ def parse_bold(
|
||||||
|
|
||||||
def parse_italic(
|
def parse_italic(
|
||||||
text: str,
|
text: str,
|
||||||
cite: bool = True,
|
can_cite: bool = True,
|
||||||
bold: bool = True,
|
can_bold: bool = True,
|
||||||
) -> Spans:
|
) -> Spans:
|
||||||
|
"""
|
||||||
|
Parses text into an italic span.
|
||||||
|
"""
|
||||||
italic_open = text.find("//")
|
italic_open = text.find("//")
|
||||||
if italic_open > -1:
|
if italic_open > -1:
|
||||||
italic_close = text.find("//", italic_open + 2)
|
italic_close = text.find("//", italic_open + 2)
|
||||||
|
@ -154,7 +170,7 @@ def parse_italic(
|
||||||
# Parse inner text minus italic parsing
|
# Parse inner text minus italic parsing
|
||||||
text_inner = text[italic_open + 2 : italic_close]
|
text_inner = text[italic_open + 2 : italic_close]
|
||||||
spans_inner = parse_paired_formatting(
|
spans_inner = parse_paired_formatting(
|
||||||
text_inner, cite=cite, bold=bold, italic=False
|
text_inner, can_cite=can_cite, can_bold=can_bold, can_italic=False
|
||||||
)
|
)
|
||||||
italic = ItalicSpan(spans_inner)
|
italic = ItalicSpan(spans_inner)
|
||||||
return [*spans_before, italic, *spans_after]
|
return [*spans_before, italic, *spans_after]
|
||||||
|
@ -163,9 +179,15 @@ def parse_italic(
|
||||||
|
|
||||||
|
|
||||||
def parse_breaks(text: str) -> Spans:
|
def parse_breaks(text: str) -> Spans:
|
||||||
|
"""
|
||||||
|
Parses intra-paragraph line breaks.
|
||||||
|
"""
|
||||||
|
# Parse empty text into nothing
|
||||||
if not text:
|
if not text:
|
||||||
return []
|
return []
|
||||||
|
# Split on the line break mark appearing at the end of the line
|
||||||
splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
|
splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
|
||||||
|
# Put a LineBreak between each TextSpan
|
||||||
spans: Spans = [
|
spans: Spans = [
|
||||||
splits[i // 2] if i % 2 == 0 else LineBreak()
|
splits[i // 2] if i % 2 == 0 else LineBreak()
|
||||||
for i in range(0, 2 * len(splits) - 1)
|
for i in range(0, 2 * len(splits) - 1)
|
||||||
|
|
|
@ -0,0 +1,176 @@
|
||||||
|
from typing import Sequence
|
||||||
|
|
||||||
|
from amanuensis.parser.core import (
|
||||||
|
TextSpan,
|
||||||
|
LineBreak,
|
||||||
|
ParsedArticle,
|
||||||
|
BodyParagraph,
|
||||||
|
SignatureParagraph,
|
||||||
|
BoldSpan,
|
||||||
|
ItalicSpan,
|
||||||
|
CitationSpan,
|
||||||
|
Renderable,
|
||||||
|
SpanContainer,
|
||||||
|
RenderableVisitor,
|
||||||
|
Spans,
|
||||||
|
)
|
||||||
|
from amanuensis.parser.helpers import normalize_title, filesafe_title, titlesort
|
||||||
|
from amanuensis.parser.parsing import (
|
||||||
|
parse_breaks,
|
||||||
|
parse_paired_formatting,
|
||||||
|
parse_paragraph,
|
||||||
|
parse_raw_markdown,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def assert_types(spans: Spans, types: Sequence, loc=None):
    """
    Asserts that a span list has the types specified.

    Each element in `types` should be either a span type or a list. The first
    element of the list is the container type and the remaining elements are the
    content types, which are checked recursively against the container's spans.

    `loc` is the dotted index path of the container being checked. It is only
    used to build assertion messages and is None for the top-level span list.
    """
    assert len(spans) == len(
        types
    ), f"Unexpected type sequence length at loc {loc if loc else 'root'}"
    for i, (span, span_type) in enumerate(zip(spans, types)):
        # Extend the dotted path so failures point at the offending element
        i_loc = f"{loc}.{i}" if loc else f"{i}"
        if isinstance(span_type, list):
            # A list describes a container: [container type, *content types]
            assert isinstance(
                span, SpanContainer
            ), f"Expected a span container at loc {i_loc}"
            assert (
                len(span.spans) == len(span_type) - 1
            ), f"Unexpected container size at loc {i_loc}"
            assert isinstance(
                span, span_type[0]
            ), f"Unexpected container type at loc {i_loc}"
            assert_types(span.spans, span_type[1:], loc=i_loc)
        else:
            assert isinstance(span, Renderable), f"Expected a span at loc {i_loc}"
            assert isinstance(span, span_type), f"Unexpected span type at loc {i_loc}"
|
||||||
|
|
||||||
|
|
||||||
|
def assert_text(spans: Spans, texts: Sequence, loc=None):
    """
    Asserts that a span list has the inner text structure specified.

    Each element in `texts` should be either a string or a list of the same.
    A string is compared against the inner text of a TextSpan, and a list is
    checked recursively against the spans of a SpanContainer. Any other value
    (the tests in this module pass None) asserts that the span is a LineBreak.

    `loc` is the dotted index path of the container being checked. It is only
    used to build assertion messages and is None for the top-level span list.
    """
    assert len(spans) == len(
        texts
    ), f"Unexpected text sequence length at loc {loc if loc else 'root'}"
    for i, (span, text) in enumerate(zip(spans, texts)):
        # Extend the dotted path so failures point at the offending element
        i_loc = f"{loc}.{i}" if loc else f"{i}"
        if isinstance(text, str):
            assert isinstance(span, TextSpan), f"Expected a text span at loc {i_loc}"
            assert span.innertext == text, f"Unexpected text at loc {i_loc}"
        elif isinstance(text, list):
            assert isinstance(
                span, SpanContainer
            ), f"Expected a span container at loc {i_loc}"
            assert_text(span.spans, text, loc=i_loc)
        else:
            # Anything that is neither a string nor a list marks a line break
            assert isinstance(span, LineBreak), f"Expected a line break at loc {i_loc}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_breaks():
    """Test parsing for intra-paragraph line break"""
    # Declared once here; re-annotating on each assignment below would be
    # reported by type checkers as a redefinition.
    text: str
    spans: Spans

    # Only having a line break does nothing
    text = "One\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan])
    assert_text(spans, [text])

    # Having the mark causes the text to be split across it
    text = r"One\\" + "\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan, LineBreak, TextSpan])
    assert_text(spans, ["One", None, "Two"])

    # Multiple lines can be broken
    text = r"One\\" + "\n" + r"Two\\" + "\nThree"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan, LineBreak, TextSpan, LineBreak, TextSpan])
    assert_text(spans, ["One", None, "Two", None, "Three"])

    # The mark must be at the end of the line
    text = r"One\\ " + "\nTwo"
    spans = parse_breaks(text)
    assert_types(spans, [TextSpan])
    assert_text(spans, [text])
|
||||||
|
|
||||||
|
|
||||||
|
def test_simple_single_parse_pairs():
    """Test parsing for bold and italic marks"""
    # Each case is (input text, expected type structure, expected text
    # structure). A text structure of None means only the types are checked.
    cases = [
        # Empty pair marks should parse
        ("****", [[BoldSpan]], None),
        ("////", [[ItalicSpan]], None),
        # Pair marks with text inside should parse
        ("**hello**", [[BoldSpan, TextSpan]], [["hello"]]),
        ("//hello//", [[ItalicSpan, TextSpan]], [["hello"]]),
        # Text outside of pair marks should parse on the same level
        (
            "**hello** world",
            [[BoldSpan, TextSpan], TextSpan],
            [["hello"], " world"],
        ),
        (
            "//hello// world",
            [[ItalicSpan, TextSpan], TextSpan],
            [["hello"], " world"],
        ),
        # Text before, between, and after pair marks should parse
        (
            "In the **beginning** was //the// Word",
            [
                TextSpan,
                [BoldSpan, TextSpan],
                TextSpan,
                [ItalicSpan, TextSpan],
                TextSpan,
            ],
            ["In the ", ["beginning"], " was ", ["the"], " Word"],
        ),
    ]

    for source, expected_types, expected_texts in cases:
        parsed = parse_paired_formatting(source)
        assert_types(parsed, expected_types)
        if expected_texts is not None:
            assert_text(parsed, expected_texts)
|
||||||
|
|
||||||
|
|
||||||
|
def test_simple_nested_parse_pairs():
    """Test parsing for nesting bold and italic"""
    # Each case is (input text, expected type structure, expected text
    # structure).
    cases = [
        # Simple nested test cases
        (
            "**//hello//**",
            [[BoldSpan, [ItalicSpan, TextSpan]]],
            [[["hello"]]],
        ),
        (
            "//**world**//",
            [[ItalicSpan, [BoldSpan, TextSpan]]],
            [[["world"]]],
        ),
        # Overlap should only parse the first
        (
            "**Hello//world**//",
            [[BoldSpan, TextSpan], TextSpan],
            [["Hello//world"], "//"],
        ),
    ]

    for source, expected_types, expected_texts in cases:
        parsed = parse_paired_formatting(source)
        assert_types(parsed, expected_types)
        assert_text(parsed, expected_texts)
|
Loading…
Reference in New Issue