amanuensis/amanuensis/parser/parsing.py

213 lines
6.7 KiB
Python

"""
Internal module encapsulating a recursive descent parser for
Lexipython markdown.
"""
import re
from typing import Sequence
from .core import (
TextSpan,
LineBreak,
ParsedArticle,
BodyParagraph,
SignatureParagraph,
BoldSpan,
ItalicSpan,
CitationSpan,
Renderable,
SpanContainer,
)
Spans = Sequence[Renderable]
def parse_raw_markdown(text: str) -> ParsedArticle:
"""
Parses a body of Lexipython markdown into a Renderable tree.
"""
# Parse each paragraph individually, as no formatting applies
# across paragraphs
paragraphs = re.split(r"\n\n+", text)
parse_results = list(map(parse_paragraph, paragraphs))
return ParsedArticle(parse_results)
def parse_paragraph(text: str) -> SpanContainer:
"""
Parses a block of text into a paragraph object.
"""
# Parse the paragraph as a span of text
text = text.strip()
if text and text[0] == "~":
return SignatureParagraph(parse_paired_formatting(text[1:]))
else:
return BodyParagraph(parse_paired_formatting(text))
def parse_paired_formatting(
text: str,
in_cite: bool = False,
in_bold: bool = False,
in_italic: bool = False,
) -> Spans:
"""
Parses citations, bolds, and italics, which can be nested inside each other.
A single type cannot nest inside itself, which is controlled by setting the
flag parameters to False.
"""
# Find positions of any paired formatting
next_cite = find_pair(text, "[[", "]]") if not in_cite else -1
next_bold = find_pair(text, "**", "**") if not in_bold else -1
next_italic = find_pair(text, "//", "//") if not in_italic else -1
# Create a map from a formatting mark's distance to its parse handler
handlers = {}
handlers[next_cite] = lambda: parse_citation(
text, in_bold=in_bold, in_italic=in_italic
)
handlers[next_bold] = lambda: parse_bold(
text, in_cite=in_cite, in_italic=in_italic
)
handlers[next_italic] = lambda: parse_italic(
text, in_cite=in_cite, in_bold=in_bold
)
# Map the next parsing step at -1. If we're currently inside a formatting
# mark pair, skip parsing line breaks, which are not allowed inside paired
# marks.
if in_cite or in_bold or in_italic:
handlers[-1] = lambda: parse_text(text)
else:
handlers[-1] = lambda: parse_breaks(text)
# Choose the handler for the earliest found pair, or the default handler
# at -1 if nothing was found.
finds = [i for i in (next_cite, next_bold, next_italic) if i > -1]
first = min(finds) if finds else -1
return handlers[first]()
def find_pair(text: str, open_tag: str, close_tag: str) -> int:
"""
Finds the beginning of a pair of formatting marks.
"""
# If the open tag wasn't found, return -1
first = text.find(open_tag)
if first < 0:
return -1
# If the close tag wasn't found after the open tag, return -1
second = text.find(close_tag, first + len(open_tag))
if second < 0:
return -1
# Otherwise, the pair exists
return first
def parse_citation(
text: str,
in_bold: bool = False,
in_italic: bool = False,
) -> Spans:
"""
Parses text into a citation span.
"""
cite_open = text.find("[[")
if cite_open > -1:
cite_close = text.find("]]", cite_open + 2)
# Since we searched for pairs from the beginning, there should be no
# undetected pair formatting before this one, so move to the next
# level of parsing
spans_before = parse_breaks(text[:cite_open])
# Continue parsing pair formatting after this one closes with all
# three as valid choices
spans_after = parse_paired_formatting(text[cite_close + 2 :])
# Parse inner text and skip parsing for this format pair
text_inner = text[cite_open + 2 : cite_close]
# For citations specifically, we may need to split off a citation
# target from the alias text
inner_split = text_inner.split("|", 1)
text_inner_actual, cite_target = inner_split[0], inner_split[-1]
spans_inner = parse_paired_formatting(
text_inner_actual, in_cite=True, in_bold=in_bold, in_italic=in_italic
)
citation = CitationSpan(spans_inner, cite_target)
return [*spans_before, citation, *spans_after]
# Should never happen
return parse_breaks(text)
def parse_bold(
text: str,
in_cite: bool = False,
in_italic: bool = False,
) -> Spans:
"""
Parses text into a bold span.
"""
bold_open = text.find("**")
if bold_open > -1:
bold_close = text.find("**", bold_open + 2)
# Should be no formatting behind us
spans_before = parse_breaks(text[:bold_open])
# Freely parse formatting after us
spans_after = parse_paired_formatting(text[bold_close + 2 :])
# Parse inner text minus bold parsing
text_inner = text[bold_open + 2 : bold_close]
spans_inner = parse_paired_formatting(
text_inner, in_cite=in_cite, in_bold=True, in_italic=in_italic
)
bold = BoldSpan(spans_inner)
return [*spans_before, bold, *spans_after]
# Should never happen
return parse_italic(text)
def parse_italic(
text: str,
in_cite: bool = False,
in_bold: bool = False,
) -> Spans:
"""
Parses text into an italic span.
"""
italic_open = text.find("//")
if italic_open > -1:
italic_close = text.find("//", italic_open + 2)
# Should be no formatting behind us
spans_before = parse_breaks(text[:italic_open])
# Freely parse formatting after us
spans_after = parse_paired_formatting(text[italic_close + 2 :])
# Parse inner text minus italic parsing
text_inner = text[italic_open + 2 : italic_close]
spans_inner = parse_paired_formatting(
text_inner, in_cite=in_cite, in_bold=in_bold, in_italic=True
)
italic = ItalicSpan(spans_inner)
return [*spans_before, italic, *spans_after]
# Should never happen
return parse_breaks(text)
def parse_breaks(text: str) -> Spans:
"""
Parses intra-paragraph line breaks.
"""
# Parse empty text into nothing
if not text:
return []
# Split on the line break mark appearing at the end of the line
splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
# Put a LineBreak between each TextSpan
spans: Spans = [
splits[i // 2] if i % 2 == 0 else LineBreak()
for i in range(0, 2 * len(splits) - 1)
]
return spans
def parse_text(text: str) -> Spans:
"""
Parses text with no remaining parseable marks.
"""
if not text:
return []
return [TextSpan(text)]