Incorporate parser into new code #12
|
@ -2,13 +2,14 @@
|
||||||
Module encapsulating all markdown parsing functionality.
|
Module encapsulating all markdown parsing functionality.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from .core import normalize_title
|
from .core import RenderableVisitor
|
||||||
from .helpers import titlesort, filesafe_title
|
from .helpers import normalize_title, filesafe_title, titlesort
|
||||||
from .parsing import parse_raw_markdown
|
from .parsing import parse_raw_markdown
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
normalize_title.__name__,
|
"RenderableVisitor",
|
||||||
titlesort.__name__,
|
"normalize_title",
|
||||||
filesafe_title.__name__,
|
"filesafe_title",
|
||||||
parse_raw_markdown.__name__,
|
"titlesort",
|
||||||
|
"parse_raw_markdown",
|
||||||
]
|
]
|
||||||
|
|
|
@ -5,32 +5,26 @@ which can be operated on by a visitor defining functions that hook off
|
||||||
of the different token types.
|
of the different token types.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
|
||||||
from typing import Callable, Any, Sequence
|
from typing import Callable, Any, Sequence
|
||||||
|
|
||||||
RenderHook = Callable[['Renderable'], Any]
|
from .helpers import normalize_title
|
||||||
Spans = Sequence['Renderable']
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_title(title: str) -> str:
|
RenderHook = Callable[["Renderable"], Any]
|
||||||
"""
|
Spans = Sequence["Renderable"]
|
||||||
Normalizes strings as titles:
|
|
||||||
- Strips leading and trailing whitespace
|
|
||||||
- Merges internal whitespace into a single space
|
|
||||||
- Capitalizes the first word
|
|
||||||
"""
|
|
||||||
cleaned = re.sub(r'\s+', " ", title.strip())
|
|
||||||
return cleaned[:1].capitalize() + cleaned[1:]
|
|
||||||
|
|
||||||
|
|
||||||
class Renderable():
|
class Renderable:
|
||||||
"""
|
"""
|
||||||
Base class for parsed markdown. Provides the `render()` method for
|
Base class for parsed markdown. Provides the `render()` method for
|
||||||
visiting the token tree.
|
visiting the token tree.
|
||||||
"""
|
"""
|
||||||
def render(self: 'Renderable', renderer: 'RenderableVisitor'):
|
|
||||||
|
def render(self: "Renderable", renderer: "RenderableVisitor"):
|
||||||
"""
|
"""
|
||||||
Execute the appropriate visitor method on this Renderable.
|
Execute the appropriate visitor method on this Renderable.
|
||||||
|
Visitors implement hooks by declaring methods whose names are
|
||||||
|
the name of a Renderable class.
|
||||||
"""
|
"""
|
||||||
hook: RenderHook = getattr(renderer, type(self).__name__, None)
|
hook: RenderHook = getattr(renderer, type(self).__name__, None)
|
||||||
if hook:
|
if hook:
|
||||||
|
@ -39,7 +33,8 @@ class Renderable():
|
||||||
|
|
||||||
|
|
||||||
class TextSpan(Renderable):
|
class TextSpan(Renderable):
|
||||||
"""An unstyled length of text."""
|
"""A length of text."""
|
||||||
|
|
||||||
def __init__(self, innertext: str):
|
def __init__(self, innertext: str):
|
||||||
self.innertext = innertext
|
self.innertext = innertext
|
||||||
|
|
||||||
|
@ -49,20 +44,24 @@ class TextSpan(Renderable):
|
||||||
|
|
||||||
class LineBreak(Renderable):
|
class LineBreak(Renderable):
|
||||||
"""A line break within a paragraph."""
|
"""A line break within a paragraph."""
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "<break>"
|
return "<break>"
|
||||||
|
|
||||||
|
|
||||||
class SpanContainer(Renderable):
|
class SpanContainer(Renderable):
|
||||||
"""A formatting element that wraps some amount of text."""
|
"""A formatting element that wraps some amount of text."""
|
||||||
|
|
||||||
def __init__(self, spans: Spans):
|
def __init__(self, spans: Spans):
|
||||||
self.spans: Spans = spans
|
self.spans: Spans = spans
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return (f'[{type(self).__name__} '
|
return (
|
||||||
+ f'{" ".join([str(span) for span in self.spans])}]')
|
f"[{type(self).__name__} "
|
||||||
|
+ f'{" ".join([str(span) for span in self.spans])}]'
|
||||||
|
)
|
||||||
|
|
||||||
def recurse(self, renderer: 'RenderableVisitor'):
|
def recurse(self, renderer: "RenderableVisitor"):
|
||||||
return [child.render(renderer) for child in self.spans]
|
return [child.render(renderer) for child in self.spans]
|
||||||
|
|
||||||
|
|
||||||
|
@ -88,22 +87,26 @@ class ItalicSpan(SpanContainer):
|
||||||
|
|
||||||
class CitationSpan(SpanContainer):
|
class CitationSpan(SpanContainer):
|
||||||
"""A citation to another article."""
|
"""A citation to another article."""
|
||||||
|
|
||||||
def __init__(self, spans: Spans, cite_target: str):
|
def __init__(self, spans: Spans, cite_target: str):
|
||||||
super().__init__(spans)
|
super().__init__(spans)
|
||||||
# Normalize citation target on parse, since we don't want
|
# Normalize citation target on parse, since we don't want
|
||||||
# abnormal title strings lying around causing trouble.
|
# abnormal title strings lying around causing trouble.
|
||||||
self.cite_target: str = normalize_title(cite_target)
|
self.cite_target: str = normalize_title(cite_target)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self) -> str:
|
||||||
return (f'{{{" ".join([str(span) for span in self.spans])}'
|
return (
|
||||||
+ f':{self.cite_target}}}')
|
f'{{{" ".join([str(span) for span in self.spans])}'
|
||||||
|
+ f":{self.cite_target}}}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class RenderableVisitor():
|
class RenderableVisitor:
|
||||||
"""
|
"""
|
||||||
Default implementation of the visitor pattern. Executes once on
|
Default implementation of the visitor pattern. Executes once on
|
||||||
each token in the tree and returns itself.
|
each token in the tree and returns itself.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def TextSpan(self, span: TextSpan):
|
def TextSpan(self, span: TextSpan):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,22 @@
|
||||||
|
"""
|
||||||
|
Helper functions for manipulating titles during parsing
|
||||||
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_title(title: str) -> str:
|
||||||
|
"""
|
||||||
|
Normalizes strings as titles:
|
||||||
|
- Strips leading and trailing whitespace
|
||||||
|
- Merges internal whitespace into a single space
|
||||||
|
- Capitalizes the first word
|
||||||
|
"""
|
||||||
|
cleaned = re.sub(r"\s+", " ", title.strip())
|
||||||
|
return cleaned[:1].capitalize() + cleaned[1:]
|
||||||
|
|
||||||
|
|
||||||
def titlesort(title: str) -> str:
|
def titlesort(title: str) -> str:
|
||||||
"""
|
"""
|
||||||
Strips articles off of titles for alphabetical sorting purposes
|
Strips articles off of titles for alphabetical sorting purposes
|
||||||
|
@ -20,9 +35,19 @@ def filesafe_title(title: str) -> str:
|
||||||
"""
|
"""
|
||||||
Makes an article title filename-safe.
|
Makes an article title filename-safe.
|
||||||
"""
|
"""
|
||||||
s = re.sub(r"\s+", '_', title) # Replace whitespace with _
|
# Replace whitespace with _
|
||||||
s = re.sub(r"~", '-', s) # parse.quote doesn't catch ~
|
s = re.sub(r"\s+", "_", title)
|
||||||
s = urllib.parse.quote(s) # Encode all other characters
|
|
||||||
s = re.sub(r"%", "", s) # Strip encoding %s
|
# parse.quote doesn't catch ~
|
||||||
s = s[:64] # Limit to 64 characters
|
s = re.sub(r"~", "-", s)
|
||||||
|
|
||||||
|
# Encode all other characters
|
||||||
|
s = urllib.parse.quote(s)
|
||||||
|
|
||||||
|
# Strip encoding %s
|
||||||
|
s = re.sub(r"%", "", s)
|
||||||
|
|
||||||
|
# Limit to 64 characters
|
||||||
|
s = s[:64]
|
||||||
|
|
||||||
return s
|
return s
|
||||||
|
|
|
@ -16,7 +16,7 @@ from .core import (
|
||||||
ItalicSpan,
|
ItalicSpan,
|
||||||
CitationSpan,
|
CitationSpan,
|
||||||
Renderable,
|
Renderable,
|
||||||
SpanContainer
|
SpanContainer,
|
||||||
)
|
)
|
||||||
|
|
||||||
Spans = Sequence[Renderable]
|
Spans = Sequence[Renderable]
|
||||||
|
@ -28,7 +28,7 @@ def parse_raw_markdown(text: str) -> ParsedArticle:
|
||||||
"""
|
"""
|
||||||
# Parse each paragraph individually, as no formatting applies
|
# Parse each paragraph individually, as no formatting applies
|
||||||
# across paragraphs
|
# across paragraphs
|
||||||
paragraphs = re.split(r'\n\n+', text)
|
paragraphs = re.split(r"\n\n+", text)
|
||||||
parse_results = list(map(parse_paragraph, paragraphs))
|
parse_results = list(map(parse_paragraph, paragraphs))
|
||||||
return ParsedArticle(parse_results)
|
return ParsedArticle(parse_results)
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ def parse_raw_markdown(text: str) -> ParsedArticle:
|
||||||
def parse_paragraph(text: str) -> SpanContainer:
|
def parse_paragraph(text: str) -> SpanContainer:
|
||||||
# Parse the paragraph as a span of text
|
# Parse the paragraph as a span of text
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
if text and text[0] == '~':
|
if text and text[0] == "~":
|
||||||
return SignatureParagraph(parse_paired_formatting(text[1:]))
|
return SignatureParagraph(parse_paired_formatting(text[1:]))
|
||||||
else:
|
else:
|
||||||
return BodyParagraph(parse_paired_formatting(text))
|
return BodyParagraph(parse_paired_formatting(text))
|
||||||
|
@ -46,7 +46,8 @@ def parse_paired_formatting(
|
||||||
text: str,
|
text: str,
|
||||||
cite: bool = True,
|
cite: bool = True,
|
||||||
bold: bool = True,
|
bold: bool = True,
|
||||||
italic: bool = True) -> Spans:
|
italic: bool = True,
|
||||||
|
) -> Spans:
|
||||||
# Find positions of any paired formatting
|
# Find positions of any paired formatting
|
||||||
first_cite = find_pair(text, "[[", "]]", cite)
|
first_cite = find_pair(text, "[[", "]]", cite)
|
||||||
first_bold = find_pair(text, "**", "**", bold)
|
first_bold = find_pair(text, "**", "**", bold)
|
||||||
|
@ -68,7 +69,8 @@ def find_pair(
|
||||||
text: str,
|
text: str,
|
||||||
open_tag: str,
|
open_tag: str,
|
||||||
close_tag: str,
|
close_tag: str,
|
||||||
valid: bool) -> int:
|
valid: bool,
|
||||||
|
) -> int:
|
||||||
# If skipping, return -1
|
# If skipping, return -1
|
||||||
if not valid:
|
if not valid:
|
||||||
return -1
|
return -1
|
||||||
|
@ -84,7 +86,11 @@ def find_pair(
|
||||||
return first
|
return first
|
||||||
|
|
||||||
|
|
||||||
def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
|
def parse_citation(
|
||||||
|
text: str,
|
||||||
|
bold: bool = True,
|
||||||
|
italic: bool = True,
|
||||||
|
) -> Spans:
|
||||||
cite_open = text.find("[[")
|
cite_open = text.find("[[")
|
||||||
if cite_open > -1:
|
if cite_open > -1:
|
||||||
cite_close = text.find("]]", cite_open + 2)
|
cite_close = text.find("]]", cite_open + 2)
|
||||||
|
@ -94,51 +100,62 @@ def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
|
||||||
spans_before = parse_breaks(text[:cite_open])
|
spans_before = parse_breaks(text[:cite_open])
|
||||||
# Continue parsing pair formatting after this one closes with all
|
# Continue parsing pair formatting after this one closes with all
|
||||||
# three as valid choices
|
# three as valid choices
|
||||||
spans_after = parse_paired_formatting(text[cite_close + 2:])
|
spans_after = parse_paired_formatting(text[cite_close + 2 :])
|
||||||
# Parse inner text and skip parsing for this format pair
|
# Parse inner text and skip parsing for this format pair
|
||||||
text_inner = text[cite_open + 2:cite_close]
|
text_inner = text[cite_open + 2 : cite_close]
|
||||||
# For citations specifically, we may need to split off a citation
|
# For citations specifically, we may need to split off a citation
|
||||||
# target from the alias text
|
# target from the alias text
|
||||||
inner_split = text_inner.split("|", 1)
|
inner_split = text_inner.split("|", 1)
|
||||||
text_inner_actual, cite_target = inner_split[0], inner_split[-1]
|
text_inner_actual, cite_target = inner_split[0], inner_split[-1]
|
||||||
spans_inner = parse_paired_formatting(text_inner_actual,
|
spans_inner = parse_paired_formatting(
|
||||||
cite=False, bold=bold, italic=italic)
|
text_inner_actual, cite=False, bold=bold, italic=italic
|
||||||
|
)
|
||||||
citation = CitationSpan(spans_inner, cite_target)
|
citation = CitationSpan(spans_inner, cite_target)
|
||||||
return [*spans_before, citation, *spans_after]
|
return [*spans_before, citation, *spans_after]
|
||||||
# Should never happen
|
# Should never happen
|
||||||
return parse_breaks(text)
|
return parse_breaks(text)
|
||||||
|
|
||||||
|
|
||||||
def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans:
|
def parse_bold(
|
||||||
|
text: str,
|
||||||
|
cite: bool = True,
|
||||||
|
italic: bool = True,
|
||||||
|
) -> Spans:
|
||||||
bold_open = text.find("**")
|
bold_open = text.find("**")
|
||||||
if bold_open > -1:
|
if bold_open > -1:
|
||||||
bold_close = text.find("**", bold_open + 2)
|
bold_close = text.find("**", bold_open + 2)
|
||||||
# Should be no formatting behind us
|
# Should be no formatting behind us
|
||||||
spans_before = parse_breaks(text[:bold_open])
|
spans_before = parse_breaks(text[:bold_open])
|
||||||
# Freely parse formatting after us
|
# Freely parse formatting after us
|
||||||
spans_after = parse_paired_formatting(text[bold_close + 2:])
|
spans_after = parse_paired_formatting(text[bold_close + 2 :])
|
||||||
# Parse inner text minus bold parsing
|
# Parse inner text minus bold parsing
|
||||||
text_inner = text[bold_open + 2:bold_close]
|
text_inner = text[bold_open + 2 : bold_close]
|
||||||
spans_inner = parse_paired_formatting(text_inner,
|
spans_inner = parse_paired_formatting(
|
||||||
cite=cite, bold=False, italic=italic)
|
text_inner, cite=cite, bold=False, italic=italic
|
||||||
|
)
|
||||||
bold = BoldSpan(spans_inner)
|
bold = BoldSpan(spans_inner)
|
||||||
return [*spans_before, bold, *spans_after]
|
return [*spans_before, bold, *spans_after]
|
||||||
# Should never happen
|
# Should never happen
|
||||||
return parse_italic(text)
|
return parse_italic(text)
|
||||||
|
|
||||||
|
|
||||||
def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans:
|
def parse_italic(
|
||||||
|
text: str,
|
||||||
|
cite: bool = True,
|
||||||
|
bold: bool = True,
|
||||||
|
) -> Spans:
|
||||||
italic_open = text.find("//")
|
italic_open = text.find("//")
|
||||||
if italic_open > -1:
|
if italic_open > -1:
|
||||||
italic_close = text.find("//", italic_open + 2)
|
italic_close = text.find("//", italic_open + 2)
|
||||||
# Should be no formatting behind us
|
# Should be no formatting behind us
|
||||||
spans_before = parse_breaks(text[:italic_open])
|
spans_before = parse_breaks(text[:italic_open])
|
||||||
# Freely parse formatting after us
|
# Freely parse formatting after us
|
||||||
spans_after = parse_paired_formatting(text[italic_close + 2:])
|
spans_after = parse_paired_formatting(text[italic_close + 2 :])
|
||||||
# Parse inner text minus italic parsing
|
# Parse inner text minus italic parsing
|
||||||
text_inner = text[italic_open + 2:italic_close]
|
text_inner = text[italic_open + 2 : italic_close]
|
||||||
spans_inner = parse_paired_formatting(text_inner,
|
spans_inner = parse_paired_formatting(
|
||||||
cite=cite, bold=bold, italic=False)
|
text_inner, cite=cite, bold=bold, italic=False
|
||||||
|
)
|
||||||
italic = ItalicSpan(spans_inner)
|
italic = ItalicSpan(spans_inner)
|
||||||
return [*spans_before, italic, *spans_after]
|
return [*spans_before, italic, *spans_after]
|
||||||
# Should never happen
|
# Should never happen
|
||||||
|
|
2
mypy.ini
2
mypy.ini
|
@ -1,4 +1,4 @@
|
||||||
[mypy]
|
[mypy]
|
||||||
ignore_missing_imports = true
|
ignore_missing_imports = true
|
||||||
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
|
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
|
||||||
; mypy stable doesn't support pyproject.toml yet
|
; mypy stable doesn't support pyproject.toml yet
|
|
@ -17,11 +17,11 @@ black = "^21.5b2"
|
||||||
mypy = "^0.812"
|
mypy = "^0.812"
|
||||||
|
|
||||||
[tool.black]
|
[tool.black]
|
||||||
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/parser/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
|
extend-exclude = "^/amanuensis/cli/.*|^/amanuensis/config/.*|^/amanuensis/lexicon/.*|^/amanuensis/log/.*|^/amanuensis/models/.*|^/amanuensis/resources/.*|^/amanuensis/server/.*|^/amanuensis/user/.*|^/amanuensis/__main__.py"
|
||||||
|
|
||||||
[tool.mypy]
|
[tool.mypy]
|
||||||
ignore_missing_imports = true
|
ignore_missing_imports = true
|
||||||
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/parser/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
|
exclude = "amanuensis/cli/.*|amanuensis/config/.*|amanuensis/lexicon/.*|amanuensis/log/.*|amanuensis/models/.*|amanuensis/resources/.*|amanuensis/server/.*|amanuensis/user/.*|amanuensis/__main__.py"
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
addopts = "--show-capture=log"
|
addopts = "--show-capture=log"
|
||||||
|
|
Loading…
Reference in New Issue