Incorporate parser into new code #12

Merged
Jaculabilis merged 8 commits from tvb/parser into develop 2021-06-12 17:28:19 +00:00
6 changed files with 280 additions and 234 deletions
Showing only changes of commit 1c55d866a8 - Show all commits

View File

@@ -2,13 +2,14 @@
Module encapsulating all markdown parsing functionality. Module encapsulating all markdown parsing functionality.
""" """
from .core import normalize_title from .core import RenderableVisitor
from .helpers import titlesort, filesafe_title from .helpers import normalize_title, filesafe_title, titlesort
from .parsing import parse_raw_markdown from .parsing import parse_raw_markdown
__all__ = [ __all__ = [
normalize_title.__name__, "RenderableVisitor",
titlesort.__name__, "normalize_title",
filesafe_title.__name__, "filesafe_title",
parse_raw_markdown.__name__, "titlesort",
"parse_raw_markdown",
] ]

View File

@@ -5,32 +5,26 @@ which can be operated on by a visitor defining functions that hook off
of the different token types. of the different token types.
""" """
import re
from typing import Callable, Any, Sequence from typing import Callable, Any, Sequence
RenderHook = Callable[['Renderable'], Any] from .helpers import normalize_title
Spans = Sequence['Renderable']
def normalize_title(title: str) -> str: RenderHook = Callable[["Renderable"], Any]
""" Spans = Sequence["Renderable"]
Normalizes strings as titles:
- Strips leading and trailing whitespace
- Merges internal whitespace into a single space
- Capitalizes the first word
"""
cleaned = re.sub(r'\s+', " ", title.strip())
return cleaned[:1].capitalize() + cleaned[1:]
class Renderable(): class Renderable:
""" """
Base class for parsed markdown. Provides the `render()` method for Base class for parsed markdown. Provides the `render()` method for
visiting the token tree. visiting the token tree.
""" """
def render(self: 'Renderable', renderer: 'RenderableVisitor'):
def render(self: "Renderable", renderer: "RenderableVisitor"):
""" """
Execute the appropriate visitor method on this Renderable. Execute the appropriate visitor method on this Renderable.
Visitors implement hooks by declaring methods whose names are
the name of a Renderable class.
""" """
hook: RenderHook = getattr(renderer, type(self).__name__, None) hook: RenderHook = getattr(renderer, type(self).__name__, None)
if hook: if hook:
@@ -39,7 +33,8 @@ class Renderable():
class TextSpan(Renderable): class TextSpan(Renderable):
"""An unstyled length of text.""" """A length of text."""
def __init__(self, innertext: str): def __init__(self, innertext: str):
self.innertext = innertext self.innertext = innertext
@@ -49,20 +44,24 @@ class TextSpan(Renderable):
class LineBreak(Renderable): class LineBreak(Renderable):
"""A line break within a paragraph.""" """A line break within a paragraph."""
def __str__(self): def __str__(self):
return "<break>" return "<break>"
class SpanContainer(Renderable): class SpanContainer(Renderable):
"""A formatting element that wraps some amount of text.""" """A formatting element that wraps some amount of text."""
def __init__(self, spans: Spans): def __init__(self, spans: Spans):
self.spans: Spans = spans self.spans: Spans = spans
def __str__(self): def __str__(self):
return (f'[{type(self).__name__} ' return (
+ f'{" ".join([str(span) for span in self.spans])}]') f"[{type(self).__name__} "
+ f'{" ".join([str(span) for span in self.spans])}]'
)
def recurse(self, renderer: 'RenderableVisitor'): def recurse(self, renderer: "RenderableVisitor"):
return [child.render(renderer) for child in self.spans] return [child.render(renderer) for child in self.spans]
@@ -88,22 +87,26 @@ class ItalicSpan(SpanContainer):
class CitationSpan(SpanContainer): class CitationSpan(SpanContainer):
"""A citation to another article.""" """A citation to another article."""
def __init__(self, spans: Spans, cite_target: str): def __init__(self, spans: Spans, cite_target: str):
super().__init__(spans) super().__init__(spans)
# Normalize citation target on parse, since we don't want # Normalize citation target on parse, since we don't want
# abnormal title strings lying around causing trouble. # abnormal title strings lying around causing trouble.
self.cite_target: str = normalize_title(cite_target) self.cite_target: str = normalize_title(cite_target)
def __str__(self): def __str__(self) -> str:
return (f'{{{" ".join([str(span) for span in self.spans])}' return (
+ f':{self.cite_target}}}') f'{{{" ".join([str(span) for span in self.spans])}'
+ f":{self.cite_target}}}"
)
class RenderableVisitor(): class RenderableVisitor:
""" """
Default implementation of the visitor pattern. Executes once on Default implementation of the visitor pattern. Executes once on
each token in the tree and returns itself. each token in the tree and returns itself.
""" """
def TextSpan(self, span: TextSpan): def TextSpan(self, span: TextSpan):
return self return self

View File

@@ -1,7 +1,22 @@
"""
Helper functions for manipulating titles during parsing
"""
import re import re
import urllib.parse import urllib.parse
def normalize_title(title: str) -> str:
"""
Normalizes strings as titles:
- Strips leading and trailing whitespace
- Merges internal whitespace into a single space
- Capitalizes the first word
"""
cleaned = re.sub(r"\s+", " ", title.strip())
return cleaned[:1].capitalize() + cleaned[1:]
def titlesort(title: str) -> str: def titlesort(title: str) -> str:
""" """
Strips articles off of titles for alphabetical sorting purposes Strips articles off of titles for alphabetical sorting purposes
@@ -20,9 +35,19 @@ def filesafe_title(title: str) -> str:
""" """
Makes an article title filename-safe. Makes an article title filename-safe.
""" """
s = re.sub(r"\s+", '_', title) # Replace whitespace with _ # Replace whitespace with _
s = re.sub(r"~", '-', s) # parse.quote doesn't catch ~ s = re.sub(r"\s+", "_", title)
s = urllib.parse.quote(s) # Encode all other characters
s = re.sub(r"%", "", s) # Strip encoding %s # parse.quote doesn't catch ~
s = s[:64] # Limit to 64 characters s = re.sub(r"~", "-", s)
# Encode all other characters
s = urllib.parse.quote(s)
# Strip encoding %s
s = re.sub(r"%", "", s)
# Limit to 64 characters
s = s[:64]
return s return s

View File

@@ -16,7 +16,7 @@ from .core import (
ItalicSpan, ItalicSpan,
CitationSpan, CitationSpan,
Renderable, Renderable,
SpanContainer SpanContainer,
) )
Spans = Sequence[Renderable] Spans = Sequence[Renderable]
@@ -28,7 +28,7 @@ def parse_raw_markdown(text: str) -> ParsedArticle:
""" """
# Parse each paragraph individually, as no formatting applies # Parse each paragraph individually, as no formatting applies
# across paragraphs # across paragraphs
paragraphs = re.split(r'\n\n+', text) paragraphs = re.split(r"\n\n+", text)
parse_results = list(map(parse_paragraph, paragraphs)) parse_results = list(map(parse_paragraph, paragraphs))
return ParsedArticle(parse_results) return ParsedArticle(parse_results)
@@ -36,7 +36,7 @@ def parse_raw_markdown(text: str) -> ParsedArticle:
def parse_paragraph(text: str) -> SpanContainer: def parse_paragraph(text: str) -> SpanContainer:
# Parse the paragraph as a span of text # Parse the paragraph as a span of text
text = text.strip() text = text.strip()
if text and text[0] == '~': if text and text[0] == "~":
return SignatureParagraph(parse_paired_formatting(text[1:])) return SignatureParagraph(parse_paired_formatting(text[1:]))
else: else:
return BodyParagraph(parse_paired_formatting(text)) return BodyParagraph(parse_paired_formatting(text))
@@ -46,7 +46,8 @@ def parse_paired_formatting(
text: str, text: str,
cite: bool = True, cite: bool = True,
bold: bool = True, bold: bool = True,
italic: bool = True) -> Spans: italic: bool = True,
) -> Spans:
# Find positions of any paired formatting # Find positions of any paired formatting
first_cite = find_pair(text, "[[", "]]", cite) first_cite = find_pair(text, "[[", "]]", cite)
first_bold = find_pair(text, "**", "**", bold) first_bold = find_pair(text, "**", "**", bold)
@@ -68,7 +69,8 @@ def find_pair(
text: str, text: str,
open_tag: str, open_tag: str,
close_tag: str, close_tag: str,
valid: bool) -> int: valid: bool,
) -> int:
# If skipping, return -1 # If skipping, return -1
if not valid: if not valid:
return -1 return -1
@@ -84,7 +86,11 @@ def find_pair(
return first return first
def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans: def parse_citation(
text: str,
bold: bool = True,
italic: bool = True,
) -> Spans:
cite_open = text.find("[[") cite_open = text.find("[[")
if cite_open > -1: if cite_open > -1:
cite_close = text.find("]]", cite_open + 2) cite_close = text.find("]]", cite_open + 2)
@@ -94,51 +100,62 @@ def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
spans_before = parse_breaks(text[:cite_open]) spans_before = parse_breaks(text[:cite_open])
# Continue parsing pair formatting after this one closes with all # Continue parsing pair formatting after this one closes with all
# three as valid choices # three as valid choices
spans_after = parse_paired_formatting(text[cite_close + 2:]) spans_after = parse_paired_formatting(text[cite_close + 2 :])
# Parse inner text and skip parsing for this format pair # Parse inner text and skip parsing for this format pair
text_inner = text[cite_open + 2:cite_close] text_inner = text[cite_open + 2 : cite_close]
# For citations specifically, we may need to split off a citation # For citations specifically, we may need to split off a citation
# target from the alias text # target from the alias text
inner_split = text_inner.split("|", 1) inner_split = text_inner.split("|", 1)
text_inner_actual, cite_target = inner_split[0], inner_split[-1] text_inner_actual, cite_target = inner_split[0], inner_split[-1]
spans_inner = parse_paired_formatting(text_inner_actual, spans_inner = parse_paired_formatting(
cite=False, bold=bold, italic=italic) text_inner_actual, cite=False, bold=bold, italic=italic
)
citation = CitationSpan(spans_inner, cite_target) citation = CitationSpan(spans_inner, cite_target)
return [*spans_before, citation, *spans_after] return [*spans_before, citation, *spans_after]
# Should never happen # Should never happen
return parse_breaks(text) return parse_breaks(text)
def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans: def parse_bold(
text: str,
cite: bool = True,
italic: bool = True,
) -> Spans:
bold_open = text.find("**") bold_open = text.find("**")
if bold_open > -1: if bold_open > -1:
bold_close = text.find("**", bold_open + 2) bold_close = text.find("**", bold_open + 2)
# Should be no formatting behind us # Should be no formatting behind us
spans_before = parse_breaks(text[:bold_open]) spans_before = parse_breaks(text[:bold_open])
# Freely parse formatting after us # Freely parse formatting after us
spans_after = parse_paired_formatting(text[bold_close + 2:]) spans_after = parse_paired_formatting(text[bold_close + 2 :])
# Parse inner text minus bold parsing # Parse inner text minus bold parsing
text_inner = text[bold_open + 2:bold_close] text_inner = text[bold_open + 2 : bold_close]
spans_inner = parse_paired_formatting(text_inner, spans_inner = parse_paired_formatting(
cite=cite, bold=False, italic=italic) text_inner, cite=cite, bold=False, italic=italic
)
bold = BoldSpan(spans_inner) bold = BoldSpan(spans_inner)
return [*spans_before, bold, *spans_after] return [*spans_before, bold, *spans_after]
# Should never happen # Should never happen
return parse_italic(text) return parse_italic(text)
def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans: def parse_italic(
text: str,
cite: bool = True,
bold: bool = True,
) -> Spans:
italic_open = text.find("//") italic_open = text.find("//")
if italic_open > -1: if italic_open > -1:
italic_close = text.find("//", italic_open + 2) italic_close = text.find("//", italic_open + 2)
# Should be no formatting behind us # Should be no formatting behind us
spans_before = parse_breaks(text[:italic_open]) spans_before = parse_breaks(text[:italic_open])
# Freely parse formatting after us # Freely parse formatting after us
spans_after = parse_paired_formatting(text[italic_close + 2:]) spans_after = parse_paired_formatting(text[italic_close + 2 :])
# Parse inner text minus italic parsing # Parse inner text minus italic parsing
text_inner = text[italic_open + 2:italic_close] text_inner = text[italic_open + 2 : italic_close]
spans_inner = parse_paired_formatting(text_inner, spans_inner = parse_paired_formatting(
cite=cite, bold=bold, italic=False) text_inner, cite=cite, bold=bold, italic=False
)
italic = ItalicSpan(spans_inner) italic = ItalicSpan(spans_inner)
return [*spans_before, italic, *spans_after] return [*spans_before, italic, *spans_after]
# Should never happen # Should never happen

View File

@@ -1,4 +1,4 @@
[mypy] [mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
; mypy stable doesn't support pyproject.toml yet ; mypy stable doesn't support pyproject.toml yet

View File

@@ -17,11 +17,11 @@ black = "^21.5b2"
mypy = "^0.812" mypy = "^0.812"
[tool.black] [tool.black]
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py" extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
[tool.mypy] [tool.mypy]
ignore_missing_imports = true ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py" exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
[tool.pytest.ini_options] [tool.pytest.ini_options]
addopts = "--show-capture=log" addopts = "--show-capture=log"