Incorporate parser into new code #12

Merged
Jaculabilis merged 8 commits from tvb/parser into develop 2021-06-12 17:28:19 +00:00
6 changed files with 280 additions and 234 deletions
Showing only changes of commit 1c55d866a8 - Show all commits

View File

@ -2,13 +2,14 @@
Module encapsulating all markdown parsing functionality.
"""
from .core import normalize_title
from .helpers import titlesort, filesafe_title
from .core import RenderableVisitor
from .helpers import normalize_title, filesafe_title, titlesort
from .parsing import parse_raw_markdown
# Public names re-exported by the parser package. The entries are plain
# string literals, listed once each, so building the list never depends on
# the imported objects exposing a `__name__` attribute.
__all__ = [
    "RenderableVisitor",
    "normalize_title",
    "filesafe_title",
    "titlesort",
    "parse_raw_markdown",
]

View File

@ -5,131 +5,134 @@ which can be operated on by a visitor defining functions that hook off
of the different token types.
"""
import re
from typing import Callable, Any, Sequence
RenderHook = Callable[['Renderable'], Any]
Spans = Sequence['Renderable']
from .helpers import normalize_title
def normalize_title(title: str) -> str:
    """
    Normalize a string for use as a title.

    - Leading and trailing whitespace is removed.
    - Every internal run of whitespace becomes a single space.
    - The first character is passed through `str.capitalize()`; the
      remainder of the string is left exactly as written.
    """
    collapsed = re.sub(r'\s+', " ", title.strip())
    head, tail = collapsed[:1], collapsed[1:]
    return head.capitalize() + tail
RenderHook = Callable[["Renderable"], Any]
Spans = Sequence["Renderable"]
class Renderable:
    """
    Base class for parsed markdown. Provides the `render()` method for
    visiting the token tree.
    """

    def render(self: "Renderable", renderer: "RenderableVisitor"):
        """
        Execute the appropriate visitor method on this Renderable.

        The hook is looked up on `renderer` by the name of this object's
        concrete class, so a visitor implements a hook by declaring a
        method named after a Renderable class. Returns whatever the hook
        returns, or None when the visitor has no (truthy) attribute of
        that name.
        """
        hook_name = type(self).__name__
        hook = getattr(renderer, hook_name, None)
        return hook(self) if hook else None
class TextSpan(Renderable):
    """A leaf token holding a run of text."""

    def __init__(self, innertext: str):
        # The text carried by this span, stored exactly as given.
        self.innertext = innertext

    def __str__(self):
        # Debug form: the text wrapped in square brackets.
        return "[" + f"{self.innertext}" + "]"
class LineBreak(Renderable):
    """A leaf token marking a line break inside a paragraph."""

    def __str__(self):
        # Debug form; carries no text of its own.
        return "<break>"
class SpanContainer(Renderable):
    """A token that wraps a sequence of child tokens."""

    def __init__(self, spans: Spans):
        # Child tokens, in document order.
        self.spans: Spans = spans

    def __str__(self):
        # Debug form: "[ClassName child child ...]".
        children = " ".join(str(span) for span in self.spans)
        return f"[{type(self).__name__} {children}]"

    def recurse(self, renderer: "RenderableVisitor"):
        """
        Render every child with `renderer` and return the results as a
        list, in child order.
        """
        rendered = []
        for child in self.spans:
            rendered.append(child.render(renderer))
        return rendered
class ParsedArticle(SpanContainer):
    """Root of the token tree; its children are the article's paragraph tokens."""
class BodyParagraph(SpanContainer):
    """An ordinary paragraph of article text."""
class SignatureParagraph(SpanContainer):
    """A paragraph that was introduced by a signature mark."""
class BoldSpan(SpanContainer):
    """The tokens enclosed by a pair of bold marks."""
class ItalicSpan(SpanContainer):
    """The tokens enclosed by a pair of italic marks."""
class CitationSpan(SpanContainer):
    """A citation of another article: display spans plus a target title."""

    def __init__(self, spans: Spans, cite_target: str):
        super().__init__(spans)
        # The target is normalized at parse time so that un-normalized
        # title strings never circulate through the rest of the code.
        self.cite_target: str = normalize_title(cite_target)

    def __str__(self) -> str:
        # Debug form: "{child child ...:target}".
        children = " ".join(str(span) for span in self.spans)
        return f"{{{children}:{self.cite_target}}}"
class RenderableVisitor:
    """
    Default implementation of the visitor pattern.

    Each hook is named after the Renderable class it handles. Hooks for
    leaf tokens do nothing; hooks for container tokens recurse into the
    children. Every hook returns the visitor itself.
    """

    # --- Leaf tokens: nothing to descend into ---

    def TextSpan(self, span: TextSpan):
        return self

    def LineBreak(self, span: LineBreak):
        return self

    # --- Container tokens: visit every child, then return the visitor ---

    def ParsedArticle(self, span: ParsedArticle):
        span.recurse(self)
        return self

    def BodyParagraph(self, span: BodyParagraph):
        span.recurse(self)
        return self

    def SignatureParagraph(self, span: SignatureParagraph):
        span.recurse(self)
        return self

    def BoldSpan(self, span: BoldSpan):
        span.recurse(self)
        return self

    def ItalicSpan(self, span: ItalicSpan):
        span.recurse(self)
        return self

    def CitationSpan(self, span: CitationSpan):
        span.recurse(self)
        return self

View File

@ -1,28 +1,53 @@
"""
Helper functions for manipulating titles during parsing
"""
import re
import urllib.parse
def normalize_title(title: str) -> str:
    """
    Normalizes strings as titles:
    - Strips leading and trailing whitespace
    - Merges internal whitespace into a single space
    - Capitalizes the first character, leaving the rest unchanged
    """
    trimmed = title.strip()
    single_spaced = re.sub(r"\s+", " ", trimmed)
    if not single_spaced:
        # Nothing to capitalize in an empty title.
        return single_spaced
    return single_spaced[0].capitalize() + single_spaced[1:]
def titlesort(title: str) -> str:
    """
    Returns the lowercased title with one leading article ("the", "an",
    or "a", followed by a space) removed, for alphabetical sorting.
    """
    lowered = title.lower()
    for article in ("the ", "an ", "a "):
        if lowered.startswith(article):
            return lowered[len(article):]
    return lowered
def filesafe_title(title: str) -> str:
    """
    Makes an article title filename-safe.

    Whitespace runs become "_", "~" becomes "-", every other character
    outside the URL-unreserved set is percent-encoded with the "%" signs
    then stripped, and the result is cut to at most 64 characters.
    """
    # Replace whitespace with _
    s = re.sub(r"\s+", "_", title)
    # parse.quote doesn't catch ~
    s = s.replace("~", "-")
    # Encode all other characters. `safe=""` matters: by default quote()
    # leaves "/" untouched, which would let a title smuggle a path
    # separator into the resulting filename.
    s = urllib.parse.quote(s, safe="")
    # Strip encoding %s
    s = s.replace("%", "")
    # Limit to 64 characters
    return s[:64]

View File

@ -7,150 +7,167 @@ import re
from typing import Sequence
from .core import (
TextSpan,
LineBreak,
ParsedArticle,
BodyParagraph,
SignatureParagraph,
BoldSpan,
ItalicSpan,
CitationSpan,
Renderable,
SpanContainer
TextSpan,
LineBreak,
ParsedArticle,
BodyParagraph,
SignatureParagraph,
BoldSpan,
ItalicSpan,
CitationSpan,
Renderable,
SpanContainer,
)
Spans = Sequence[Renderable]
def parse_raw_markdown(text: str) -> ParsedArticle:
    """
    Parses a body of Lexipython markdown into a Renderable tree.

    The text is split wherever two or more newlines occur in a row, and
    each resulting chunk is parsed as its own paragraph, since no
    formatting applies across paragraphs.
    """
    chunks = re.split(r"\n\n+", text)
    return ParsedArticle([parse_paragraph(chunk) for chunk in chunks])
def parse_paragraph(text: str) -> SpanContainer:
    """
    Parses one paragraph of text.

    After stripping surrounding whitespace, a paragraph beginning with
    "~" becomes a SignatureParagraph (the "~" itself is dropped); anything
    else becomes a BodyParagraph.
    """
    stripped = text.strip()
    if stripped.startswith("~"):
        return SignatureParagraph(parse_paired_formatting(stripped[1:]))
    return BodyParagraph(parse_paired_formatting(stripped))
def parse_paired_formatting(
    text: str,
    cite: bool = True,
    bold: bool = True,
    italic: bool = True,
) -> Spans:
    """
    Parses the earliest paired formatting found in the text.

    Each of the `cite`, `bold`, and `italic` flags enables the search for
    that kind of pair. The parser for whichever enabled pair opens first
    is invoked; if no pair is found, the text moves on to line-break
    parsing.
    """
    # Pair each candidate's opening position with the parser to run for it
    candidates = [
        (
            find_pair(text, "[[", "]]", cite),
            lambda: parse_citation(text, bold=bold, italic=italic),
        ),
        (
            find_pair(text, "**", "**", bold),
            lambda: parse_bold(text, cite=cite, italic=italic),
        ),
        (
            find_pair(text, "//", "//", italic),
            lambda: parse_italic(text, cite=cite, bold=bold),
        ),
    ]
    # Discard anything that wasn't found (position -1)
    found = [entry for entry in candidates if entry[0] > -1]
    if not found:
        # If nothing was found, move on to the next parsing step
        return parse_breaks(text)
    # Choose a handler based on the earliest found result
    _, handler = min(found, key=lambda entry: entry[0])
    return handler()
def find_pair(
    text: str,
    open_tag: str,
    close_tag: str,
    valid: bool,
) -> int:
    """
    Returns the index of the first `open_tag` in `text`, provided a
    `close_tag` occurs somewhere after it. Returns -1 if `valid` is
    false, if there is no open tag, or if no close tag follows the first
    open tag.
    """
    if valid:
        start = text.find(open_tag)
        # The close tag must begin after the open tag has ended
        if start >= 0 and text.find(close_tag, start + len(open_tag)) >= 0:
            return start
    return -1
def parse_citation(
    text: str,
    bold: bool = True,
    italic: bool = True,
) -> Spans:
    """
    Parses the first "[[...]]" citation in the text.

    Text before the citation goes to line-break parsing, text after it is
    parsed for paired formatting again, and the citation's inner text is
    parsed with citations disabled and `bold`/`italic` passed through.
    An inner "|" separates the displayed text from the citation target;
    without one, the inner text serves as both.

    If the text holds no complete "[[...]]" pair, the whole text is
    handed to line-break parsing instead.
    """
    cite_open = text.find("[[")
    cite_close = text.find("]]", cite_open + 2) if cite_open > -1 else -1
    if cite_close < 0:
        # Should never happen when reached through parse_paired_formatting,
        # but a direct caller may pass an unpaired tag; slicing with a -1
        # index would silently produce wrong spans, so bail out instead.
        return parse_breaks(text)
    # Since we searched for pairs from the beginning, there should be no
    # undetected pair formatting before this one, so move to the next
    # level of parsing
    spans_before = parse_breaks(text[:cite_open])
    # Continue parsing pair formatting after this one closes with all
    # three as valid choices
    spans_after = parse_paired_formatting(text[cite_close + 2 :])
    # Parse inner text and skip parsing for this format pair
    text_inner = text[cite_open + 2 : cite_close]
    # For citations specifically, we may need to split off a citation
    # target from the alias text
    inner_split = text_inner.split("|", 1)
    text_inner_actual, cite_target = inner_split[0], inner_split[-1]
    spans_inner = parse_paired_formatting(
        text_inner_actual, cite=False, bold=bold, italic=italic
    )
    citation = CitationSpan(spans_inner, cite_target)
    return [*spans_before, citation, *spans_after]
def parse_bold(
    text: str,
    cite: bool = True,
    italic: bool = True,
) -> Spans:
    """
    Parses the first "**...**" bold pair in the text.

    Text before the pair goes to line-break parsing, text after it is
    parsed for paired formatting again, and the inner text is parsed with
    bold disabled and `cite`/`italic` passed through.

    If the text holds no complete "**...**" pair, the whole text is
    handed to line-break parsing instead.
    """
    bold_open = text.find("**")
    bold_close = text.find("**", bold_open + 2) if bold_open > -1 else -1
    if bold_close < 0:
        # Should never happen when reached through parse_paired_formatting.
        # Fall back to plain line-break parsing, as the citation and italic
        # parsers do, rather than slicing with a -1 index.
        return parse_breaks(text)
    # Should be no formatting behind us
    spans_before = parse_breaks(text[:bold_open])
    # Freely parse formatting after us
    spans_after = parse_paired_formatting(text[bold_close + 2 :])
    # Parse inner text minus bold parsing
    text_inner = text[bold_open + 2 : bold_close]
    spans_inner = parse_paired_formatting(
        text_inner, cite=cite, bold=False, italic=italic
    )
    bold_span = BoldSpan(spans_inner)
    return [*spans_before, bold_span, *spans_after]
def parse_italic(
    text: str,
    cite: bool = True,
    bold: bool = True,
) -> Spans:
    """
    Parses the first "//...//" italic pair in the text.

    Text before the pair goes to line-break parsing, text after it is
    parsed for paired formatting again, and the inner text is parsed with
    italic disabled and `cite`/`bold` passed through.

    If the text holds no complete "//...//" pair, the whole text is
    handed to line-break parsing instead.
    """
    italic_open = text.find("//")
    italic_close = text.find("//", italic_open + 2) if italic_open > -1 else -1
    if italic_close < 0:
        # Should never happen when reached through parse_paired_formatting,
        # but slicing with a -1 index would silently produce wrong spans
        # for a direct caller passing an unpaired tag.
        return parse_breaks(text)
    # Should be no formatting behind us
    spans_before = parse_breaks(text[:italic_open])
    # Freely parse formatting after us
    spans_after = parse_paired_formatting(text[italic_close + 2 :])
    # Parse inner text minus italic parsing
    text_inner = text[italic_open + 2 : italic_close]
    spans_inner = parse_paired_formatting(
        text_inner, cite=cite, bold=bold, italic=False
    )
    italic_span = ItalicSpan(spans_inner)
    return [*spans_before, italic_span, *spans_after]
def parse_breaks(text: str) -> Spans:
    """
    Splits text into TextSpans separated by LineBreaks.

    A break is written as two backslashes followed by a newline. Empty
    text yields no spans at all; otherwise every piece between breaks
    becomes a TextSpan, including empty pieces.
    """
    if not text:
        return []
    spans = []
    for index, piece in enumerate(text.split("\\\\\n")):
        if index:
            # A break sits between each pair of adjacent pieces
            spans.append(LineBreak())
        spans.append(TextSpan(piece))
    return spans

View File

@ -1,4 +1,4 @@
[mypy]
ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
; mypy stable doesn't support pyproject.toml yet

View File

@ -17,11 +17,11 @@ black = "^21.5b2"
mypy = "^0.812"
[tool.black]
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
[tool.mypy]
ignore_missing_imports = true
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
[tool.pytest.ini_options]
addopts = "--show-capture=log"