Incorporate parser into new code #12

Merged
Jaculabilis merged 8 commits from tvb/parser into develop 2021-06-12 17:28:19 +00:00
6 changed files with 280 additions and 234 deletions
Showing only changes of commit 1c55d866a8 - Show all commits

View File

@ -2,13 +2,14 @@
Module encapsulating all markdown parsing functionality. Module encapsulating all markdown parsing functionality.
""" """
from .core import normalize_title from .core import RenderableVisitor
from .helpers import titlesort, filesafe_title from .helpers import normalize_title, filesafe_title, titlesort
from .parsing import parse_raw_markdown from .parsing import parse_raw_markdown
__all__ = [ __all__ = [
normalize_title.__name__, "RenderableVisitor",
titlesort.__name__, "normalize_title",
filesafe_title.__name__, "filesafe_title",
parse_raw_markdown.__name__, "titlesort",
"parse_raw_markdown",
] ]

View File

@ -5,131 +5,134 @@ which can be operated on by a visitor defining functions that hook off
of the different token types. of the different token types.
""" """
import re
from typing import Callable, Any, Sequence from typing import Callable, Any, Sequence
RenderHook = Callable[['Renderable'], Any] from .helpers import normalize_title
Spans = Sequence['Renderable']
def normalize_title(title: str) -> str: RenderHook = Callable[["Renderable"], Any]
""" Spans = Sequence["Renderable"]
Normalizes strings as titles:
- Strips leading and trailing whitespace
- Merges internal whitespace into a single space
- Capitalizes the first word
"""
cleaned = re.sub(r'\s+', " ", title.strip())
return cleaned[:1].capitalize() + cleaned[1:]
class Renderable(): class Renderable:
""" """
Base class for parsed markdown. Provides the `render()` method for Base class for parsed markdown. Provides the `render()` method for
visiting the token tree. visiting the token tree.
""" """
def render(self: 'Renderable', renderer: 'RenderableVisitor'):
""" def render(self: "Renderable", renderer: "RenderableVisitor"):
Execute the appropriate visitor method on this Renderable. """
""" Execute the appropriate visitor method on this Renderable.
hook: RenderHook = getattr(renderer, type(self).__name__, None) Visitors implement hooks by declaring methods whose names are
if hook: the name of a Renderable class.
return hook(self) """
return None hook: RenderHook = getattr(renderer, type(self).__name__, None)
if hook:
return hook(self)
return None
class TextSpan(Renderable): class TextSpan(Renderable):
"""An unstyled length of text.""" """A length of text."""
def __init__(self, innertext: str):
self.innertext = innertext
def __str__(self): def __init__(self, innertext: str):
return f"[{self.innertext}]" self.innertext = innertext
def __str__(self):
return f"[{self.innertext}]"
class LineBreak(Renderable): class LineBreak(Renderable):
"""A line break within a paragraph.""" """A line break within a paragraph."""
def __str__(self):
return "<break>" def __str__(self):
return "<break>"
class SpanContainer(Renderable): class SpanContainer(Renderable):
"""A formatting element that wraps some amount of text.""" """A formatting element that wraps some amount of text."""
def __init__(self, spans: Spans):
self.spans: Spans = spans
def __str__(self): def __init__(self, spans: Spans):
return (f'[{type(self).__name__} ' self.spans: Spans = spans
+ f'{" ".join([str(span) for span in self.spans])}]')
def recurse(self, renderer: 'RenderableVisitor'): def __str__(self):
return [child.render(renderer) for child in self.spans] return (
f"[{type(self).__name__} "
+ f'{" ".join([str(span) for span in self.spans])}]'
)
def recurse(self, renderer: "RenderableVisitor"):
return [child.render(renderer) for child in self.spans]
class ParsedArticle(SpanContainer): class ParsedArticle(SpanContainer):
"""Token tree root node, containing some number of paragraph tokens.""" """Token tree root node, containing some number of paragraph tokens."""
class BodyParagraph(SpanContainer): class BodyParagraph(SpanContainer):
"""A normal paragraph.""" """A normal paragraph."""
class SignatureParagraph(SpanContainer): class SignatureParagraph(SpanContainer):
"""A paragraph preceded by a signature mark.""" """A paragraph preceded by a signature mark."""
class BoldSpan(SpanContainer): class BoldSpan(SpanContainer):
"""A span of text inside bold marks.""" """A span of text inside bold marks."""
class ItalicSpan(SpanContainer): class ItalicSpan(SpanContainer):
"""A span of text inside italic marks.""" """A span of text inside italic marks."""
class CitationSpan(SpanContainer): class CitationSpan(SpanContainer):
"""A citation to another article.""" """A citation to another article."""
def __init__(self, spans: Spans, cite_target: str):
super().__init__(spans)
# Normalize citation target on parse, since we don't want
# abnormal title strings lying around causing trouble.
self.cite_target: str = normalize_title(cite_target)
def __str__(self): def __init__(self, spans: Spans, cite_target: str):
return (f'{{{" ".join([str(span) for span in self.spans])}' super().__init__(spans)
+ f':{self.cite_target}}}') # Normalize citation target on parse, since we don't want
# abnormal title strings lying around causing trouble.
self.cite_target: str = normalize_title(cite_target)
def __str__(self) -> str:
return (
f'{{{" ".join([str(span) for span in self.spans])}'
+ f":{self.cite_target}}}"
)
class RenderableVisitor(): class RenderableVisitor:
""" """
Default implementation of the visitor pattern. Executes once on Default implementation of the visitor pattern. Executes once on
each token in the tree and returns itself. each token in the tree and returns itself.
""" """
def TextSpan(self, span: TextSpan):
return self
def LineBreak(self, span: LineBreak): def TextSpan(self, span: TextSpan):
return self return self
def ParsedArticle(self, span: ParsedArticle): def LineBreak(self, span: LineBreak):
span.recurse(self) return self
return self
def BodyParagraph(self, span: BodyParagraph): def ParsedArticle(self, span: ParsedArticle):
span.recurse(self) span.recurse(self)
return self return self
def SignatureParagraph(self, span: SignatureParagraph): def BodyParagraph(self, span: BodyParagraph):
span.recurse(self) span.recurse(self)
return self return self
def BoldSpan(self, span: BoldSpan): def SignatureParagraph(self, span: SignatureParagraph):
span.recurse(self) span.recurse(self)
return self return self
def ItalicSpan(self, span: ItalicSpan): def BoldSpan(self, span: BoldSpan):
span.recurse(self) span.recurse(self)
return self return self
def CitationSpan(self, span: CitationSpan): def ItalicSpan(self, span: ItalicSpan):
span.recurse(self) span.recurse(self)
return self return self
def CitationSpan(self, span: CitationSpan):
span.recurse(self)
return self

View File

@ -1,28 +1,53 @@
"""
Helper functions for manipulating titles during parsing
"""
import re import re
import urllib.parse import urllib.parse
def normalize_title(title: str) -> str:
    """
    Normalizes strings as titles:
    - Strips leading and trailing whitespace
    - Merges internal whitespace into a single space
    - Capitalizes the first character, leaving the rest untouched
    """
    cleaned = re.sub(r"\s+", " ", title.strip())
    # Slice rather than index so that an empty title yields "" instead of
    # raising IndexError. capitalize() is applied to the first character
    # only, so the casing of the remainder of the title is preserved.
    return cleaned[:1].capitalize() + cleaned[1:]
def titlesort(title: str) -> str: def titlesort(title: str) -> str:
""" """
Strips articles off of titles for alphabetical sorting purposes Strips articles off of titles for alphabetical sorting purposes
""" """
lower = title.lower() lower = title.lower()
if lower.startswith("the "): if lower.startswith("the "):
return lower[4:] return lower[4:]
if lower.startswith("an "): if lower.startswith("an "):
return lower[3:] return lower[3:]
if lower.startswith("a "): if lower.startswith("a "):
return lower[2:] return lower[2:]
return lower return lower
def filesafe_title(title: str) -> str: def filesafe_title(title: str) -> str:
""" """
Makes an article title filename-safe. Makes an article title filename-safe.
""" """
s = re.sub(r"\s+", '_', title) # Replace whitespace with _ # Replace whitespace with _
s = re.sub(r"~", '-', s) # parse.quote doesn't catch ~ s = re.sub(r"\s+", "_", title)
s = urllib.parse.quote(s) # Encode all other characters
s = re.sub(r"%", "", s) # Strip encoding %s # parse.quote doesn't catch ~
s = s[:64] # Limit to 64 characters s = re.sub(r"~", "-", s)
return s
# Encode all other characters
s = urllib.parse.quote(s)
# Strip encoding %s
s = re.sub(r"%", "", s)
# Limit to 64 characters
s = s[:64]
return s

View File

@ -7,150 +7,167 @@ import re
from typing import Sequence from typing import Sequence
from .core import ( from .core import (
TextSpan, TextSpan,
LineBreak, LineBreak,
ParsedArticle, ParsedArticle,
BodyParagraph, BodyParagraph,
SignatureParagraph, SignatureParagraph,
BoldSpan, BoldSpan,
ItalicSpan, ItalicSpan,
CitationSpan, CitationSpan,
Renderable, Renderable,
SpanContainer SpanContainer,
) )
Spans = Sequence[Renderable] Spans = Sequence[Renderable]
def parse_raw_markdown(text: str) -> ParsedArticle: def parse_raw_markdown(text: str) -> ParsedArticle:
""" """
Parses a body of Lexipython markdown into a Renderable tree. Parses a body of Lexipython markdown into a Renderable tree.
""" """
# Parse each paragraph individually, as no formatting applies # Parse each paragraph individually, as no formatting applies
# across paragraphs # across paragraphs
paragraphs = re.split(r'\n\n+', text) paragraphs = re.split(r"\n\n+", text)
parse_results = list(map(parse_paragraph, paragraphs)) parse_results = list(map(parse_paragraph, paragraphs))
return ParsedArticle(parse_results) return ParsedArticle(parse_results)
def parse_paragraph(text: str) -> SpanContainer: def parse_paragraph(text: str) -> SpanContainer:
# Parse the paragraph as a span of text # Parse the paragraph as a span of text
text = text.strip() text = text.strip()
if text and text[0] == '~': if text and text[0] == "~":
return SignatureParagraph(parse_paired_formatting(text[1:])) return SignatureParagraph(parse_paired_formatting(text[1:]))
else: else:
return BodyParagraph(parse_paired_formatting(text)) return BodyParagraph(parse_paired_formatting(text))
def parse_paired_formatting( def parse_paired_formatting(
text: str, text: str,
cite: bool = True, cite: bool = True,
bold: bool = True, bold: bool = True,
italic: bool = True) -> Spans: italic: bool = True,
# Find positions of any paired formatting ) -> Spans:
first_cite = find_pair(text, "[[", "]]", cite) # Find positions of any paired formatting
first_bold = find_pair(text, "**", "**", bold) first_cite = find_pair(text, "[[", "]]", cite)
first_italic = find_pair(text, "//", "//", italic) first_bold = find_pair(text, "**", "**", bold)
# Load the possible parse handlers into the map first_italic = find_pair(text, "//", "//", italic)
handlers = {} # Load the possible parse handlers into the map
handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic) handlers = {}
handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic) handlers[first_cite] = lambda: parse_citation(text, bold=bold, italic=italic)
handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold) handlers[first_bold] = lambda: parse_bold(text, cite=cite, italic=italic)
# If nothing was found, move on to the next parsing step handlers[first_italic] = lambda: parse_italic(text, cite=cite, bold=bold)
handlers[-1] = lambda: parse_breaks(text) # If nothing was found, move on to the next parsing step
# Choose a handler based on the earliest found result handlers[-1] = lambda: parse_breaks(text)
finds = [i for i in (first_cite, first_bold, first_italic) if i > -1] # Choose a handler based on the earliest found result
first = min(finds) if finds else -1 finds = [i for i in (first_cite, first_bold, first_italic) if i > -1]
return handlers[first]() first = min(finds) if finds else -1
return handlers[first]()
def find_pair( def find_pair(
text: str, text: str,
open_tag: str, open_tag: str,
close_tag: str, close_tag: str,
valid: bool) -> int: valid: bool,
# If skipping, return -1 ) -> int:
if not valid: # If skipping, return -1
return -1 if not valid:
# If the open tag wasn't found, return -1 return -1
first = text.find(open_tag) # If the open tag wasn't found, return -1
if first < 0: first = text.find(open_tag)
return -1 if first < 0:
# If the close tag wasn't found after the open tag, return -1 return -1
second = text.find(close_tag, first + len(open_tag)) # If the close tag wasn't found after the open tag, return -1
if second < 0: second = text.find(close_tag, first + len(open_tag))
return -1 if second < 0:
# Otherwise, the pair exists return -1
return first # Otherwise, the pair exists
return first
def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans: def parse_citation(
cite_open = text.find("[[") text: str,
if cite_open > -1: bold: bool = True,
cite_close = text.find("]]", cite_open + 2) italic: bool = True,
# Since we searched for pairs from the beginning, there should be no ) -> Spans:
# undetected pair formatting before this one, so move to the next cite_open = text.find("[[")
# level of parsing if cite_open > -1:
spans_before = parse_breaks(text[:cite_open]) cite_close = text.find("]]", cite_open + 2)
# Continue parsing pair formatting after this one closes with all # Since we searched for pairs from the beginning, there should be no
# three as valid choices # undetected pair formatting before this one, so move to the next
spans_after = parse_paired_formatting(text[cite_close + 2:]) # level of parsing
# Parse inner text and skip parsing for this format pair spans_before = parse_breaks(text[:cite_open])
text_inner = text[cite_open + 2:cite_close] # Continue parsing pair formatting after this one closes with all
# For citations specifically, we may need to split off a citation # three as valid choices
# target from the alias text spans_after = parse_paired_formatting(text[cite_close + 2 :])
inner_split = text_inner.split("|", 1) # Parse inner text and skip parsing for this format pair
text_inner_actual, cite_target = inner_split[0], inner_split[-1] text_inner = text[cite_open + 2 : cite_close]
spans_inner = parse_paired_formatting(text_inner_actual, # For citations specifically, we may need to split off a citation
cite=False, bold=bold, italic=italic) # target from the alias text
citation = CitationSpan(spans_inner, cite_target) inner_split = text_inner.split("|", 1)
return [*spans_before, citation, *spans_after] text_inner_actual, cite_target = inner_split[0], inner_split[-1]
# Should never happen spans_inner = parse_paired_formatting(
return parse_breaks(text) text_inner_actual, cite=False, bold=bold, italic=italic
)
citation = CitationSpan(spans_inner, cite_target)
return [*spans_before, citation, *spans_after]
# Should never happen
return parse_breaks(text)
def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans: def parse_bold(
bold_open = text.find("**") text: str,
if bold_open > -1: cite: bool = True,
bold_close = text.find("**", bold_open + 2) italic: bool = True,
# Should be no formatting behind us ) -> Spans:
spans_before = parse_breaks(text[:bold_open]) bold_open = text.find("**")
# Freely parse formatting after us if bold_open > -1:
spans_after = parse_paired_formatting(text[bold_close + 2:]) bold_close = text.find("**", bold_open + 2)
# Parse inner text minus bold parsing # Should be no formatting behind us
text_inner = text[bold_open + 2:bold_close] spans_before = parse_breaks(text[:bold_open])
spans_inner = parse_paired_formatting(text_inner, # Freely parse formatting after us
cite=cite, bold=False, italic=italic) spans_after = parse_paired_formatting(text[bold_close + 2 :])
bold = BoldSpan(spans_inner) # Parse inner text minus bold parsing
return [*spans_before, bold, *spans_after] text_inner = text[bold_open + 2 : bold_close]
# Should never happen spans_inner = parse_paired_formatting(
return parse_italic(text) text_inner, cite=cite, bold=False, italic=italic
)
bold = BoldSpan(spans_inner)
return [*spans_before, bold, *spans_after]
# Should never happen
return parse_italic(text)
def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans: def parse_italic(
italic_open = text.find("//") text: str,
if italic_open > -1: cite: bool = True,
italic_close = text.find("//", italic_open + 2) bold: bool = True,
# Should be no formatting behind us ) -> Spans:
spans_before = parse_breaks(text[:italic_open]) italic_open = text.find("//")
# Freely parse formatting after us if italic_open > -1:
spans_after = parse_paired_formatting(text[italic_close + 2:]) italic_close = text.find("//", italic_open + 2)
# Parse inner text minus italic parsing # Should be no formatting behind us
text_inner = text[italic_open + 2:italic_close] spans_before = parse_breaks(text[:italic_open])
spans_inner = parse_paired_formatting(text_inner, # Freely parse formatting after us
cite=cite, bold=bold, italic=False) spans_after = parse_paired_formatting(text[italic_close + 2 :])
italic = ItalicSpan(spans_inner) # Parse inner text minus italic parsing
return [*spans_before, italic, *spans_after] text_inner = text[italic_open + 2 : italic_close]
# Should never happen spans_inner = parse_paired_formatting(
return parse_breaks(text) text_inner, cite=cite, bold=bold, italic=False
)
italic = ItalicSpan(spans_inner)
return [*spans_before, italic, *spans_after]
# Should never happen
return parse_breaks(text)
def parse_breaks(text: str) -> Spans: def parse_breaks(text: str) -> Spans:
if not text: if not text:
return [] return []
splits: Spans = list(map(TextSpan, text.split("\\\\\n"))) splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
spans: Spans = [ spans: Spans = [
splits[i // 2] if i % 2 == 0 else LineBreak() splits[i // 2] if i % 2 == 0 else LineBreak()
for i in range(0, 2 * len(splits) - 1) for i in range(0, 2 * len(splits) - 1)
] ]
return spans return spans

View File

@ -1,4 +1,4 @@
[mypy] [mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
; mypy stable doesn't support pyproject.toml yet ; mypy stable doesn't support pyproject.toml yet

View File

@ -17,11 +17,11 @@ black = "^21.5b2"
mypy = "^0.812" mypy = "^0.812"
[tool.black] [tool.black]
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
[tool.mypy] [tool.mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
[tool.pytest.ini_options] [tool.pytest.ini_options]
addopts = "--show-capture=log" addopts = "--show-capture=log"