Refactor and annotate parser submodule
This commit is contained in:
parent
d0f57c85ce
commit
9e4144eccf
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,3 +4,4 @@ __pycache__/
|
||||
*.egg-info
|
||||
venv/
|
||||
.vscode
|
||||
.mypy_cache
|
@ -1,8 +1,18 @@
|
||||
"""
|
||||
Module encapsulating all markdown parsing functionality
|
||||
Module encapsulating all markdown parsing functionality.
|
||||
"""
|
||||
|
||||
from amanuensis.parser.analyze import FeatureCounter, GetCitations
|
||||
from amanuensis.parser.helpers import titlesort, filesafe_title
|
||||
from amanuensis.parser.tokenizer import parse_raw_markdown
|
||||
from amanuensis.parser.render import PreviewHtmlRenderer, HtmlRenderer
|
||||
from amanuensis.parser.parsing import parse_raw_markdown
|
||||
from amanuensis.parser.render import PreviewHtmlRenderer, HtmlRenderer
|
||||
|
||||
__all__ = [
|
||||
'FeatureCounter',
|
||||
'GetCitations',
|
||||
'titlesort',
|
||||
'filesafe_title',
|
||||
'parse_raw_markdown',
|
||||
'PreviewHtmlRenderer',
|
||||
'HtmlRenderer',
|
||||
]
|
||||
|
@ -5,41 +5,22 @@ for verification against constraints.
|
||||
|
||||
import re
|
||||
|
||||
class RenderableVisitor():
|
||||
"""Default implementation of the visitor pattern"""
|
||||
def TextSpan(self, span):
|
||||
return self
|
||||
def LineBreak(self, span):
|
||||
return self
|
||||
def ParsedArticle(self, span):
|
||||
span.recurse(self)
|
||||
return self
|
||||
def BodyParagraph(self, span):
|
||||
span.recurse(self)
|
||||
return self
|
||||
def SignatureParagraph(self, span):
|
||||
span.recurse(self)
|
||||
return self
|
||||
def BoldSpan(self, span):
|
||||
span.recurse(self)
|
||||
return self
|
||||
def ItalicSpan(self, span):
|
||||
span.recurse(self)
|
||||
return self
|
||||
def CitationSpan(self, span):
|
||||
span.recurse(self)
|
||||
return self
|
||||
from amanuensis.parser.core import RenderableVisitor
|
||||
|
||||
|
||||
class GetCitations(RenderableVisitor):
    """
    Visitor that collects the citation targets found in an article.
    Rendering a ParsedArticle with it returns the collected list.
    """
    def __init__(self) -> None:
        # Targets are accumulated in the order they are visited
        self.citations: list = []

    def ParsedArticle(self, span):
        """Walk the whole article, then hand back the collected targets."""
        span.recurse(self)
        collected = self.citations
        return collected

    def CitationSpan(self, span):
        """Record this citation's target; its children are not visited."""
        target = span.cite_target
        self.citations.append(target)
        return self
|
||||
|
||||
|
||||
class FeatureCounter(RenderableVisitor):
|
||||
def __init__(self):
|
||||
self.word_count = 0
|
||||
@ -47,7 +28,7 @@ class FeatureCounter(RenderableVisitor):
|
||||
self.has_signature = False
|
||||
|
||||
def TextSpan(self, span):
|
||||
self.word_count += len(re.split('\s+', span.innertext.strip()))
|
||||
self.word_count += len(re.split(r'\s+', span.innertext.strip()))
|
||||
return self
|
||||
|
||||
def SignatureParagraph(self, span):
|
||||
|
135
amanuensis/parser/core.py
Normal file
135
amanuensis/parser/core.py
Normal file
@ -0,0 +1,135 @@
|
||||
"""
|
||||
Internal module encapsulating the core types for parsing Lexipython
|
||||
markdown. Parsed articles are represented as a hierarchy of tokens,
|
||||
which can be operated on by a visitor defining functions that hook off
|
||||
of the different token types.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Callable, Any, Sequence
|
||||
|
||||
RenderHook = Callable[['Renderable'], Any]
|
||||
Spans = Sequence['Renderable']
|
||||
|
||||
|
||||
def normalize_title(title: str) -> str:
    """
    Normalize a string for use as a title.

    - Leading and trailing whitespace is removed
    - Each internal run of whitespace becomes a single space
    - The first character is capitalized; the rest is left as written
    """
    collapsed = re.sub(r'\s+', " ", title.strip())
    if not collapsed:
        # Nothing left to capitalize
        return collapsed
    head, tail = collapsed[:1], collapsed[1:]
    return head.capitalize() + tail
|
||||
|
||||
|
||||
class Renderable():
    """
    Base class for parsed markdown tokens. Its `render()` method is how
    a visitor is applied to a node of the token tree.
    """
    def render(self: 'Renderable', renderer: 'RenderableVisitor'):
        """
        Dispatch this token to the visitor method named after its class.

        The method is looked up on `renderer` using this token's class
        name. Returns whatever that method returns, or None when the
        visitor has no truthy attribute of that name.
        """
        visit = getattr(renderer, type(self).__name__, None)
        return visit(self) if visit else None
|
||||
|
||||
|
||||
class TextSpan(Renderable):
    """A token holding a run of plain, unstyled text."""
    def __init__(self, innertext: str) -> None:
        # The text carried by this token
        self.innertext: str = innertext

    def __str__(self) -> str:
        """Return the text wrapped in square brackets."""
        return f"[{self.innertext}]"
|
||||
|
||||
|
||||
class LineBreak(Renderable):
    """A token marking a line break inside a paragraph."""
    def __str__(self) -> str:
        """Return the fixed marker string for a break."""
        return "<break>"
|
||||
|
||||
|
||||
class SpanContainer(Renderable):
    """A token that wraps a sequence of child tokens."""
    def __init__(self, spans: Spans):
        # Child tokens, kept in the order given
        self.spans: Spans = spans

    def __str__(self):
        """Return the class name followed by the children, in brackets."""
        children = " ".join(map(str, self.spans))
        return f'[{type(self).__name__} {children}]'

    def recurse(self, renderer: 'RenderableVisitor'):
        """
        Render every child with `renderer` and return the results as a
        list, in child order.
        """
        results = []
        for child in self.spans:
            results.append(child.render(renderer))
        return results
|
||||
|
||||
|
||||
class ParsedArticle(SpanContainer):
    """
    Token tree root node, containing some number of paragraph tokens.
    `parse_raw_markdown` builds one from the parsed paragraphs of an
    article.
    """
|
||||
|
||||
|
||||
class BodyParagraph(SpanContainer):
    """
    A normal paragraph. Its spans are the parsed contents of the
    paragraph text.
    """
|
||||
|
||||
|
||||
class SignatureParagraph(SpanContainer):
    """
    A paragraph preceded by a signature mark.
    NOTE(review): presumably the leading `~` tested in
    `parse_paragraph` -- confirm against the parser.
    """
|
||||
|
||||
|
||||
class BoldSpan(SpanContainer):
    """A span of text inside bold marks (`**`)."""
|
||||
|
||||
|
||||
class ItalicSpan(SpanContainer):
    """A span of text inside italic marks (`//`)."""
|
||||
|
||||
|
||||
class CitationSpan(SpanContainer):
    """A citation of another article, wrapping the citation's text."""
    def __init__(self, spans: Spans, cite_target: str):
        super().__init__(spans)
        # The target is normalized as soon as it is parsed so that no
        # un-normalized title string is ever stored on a token.
        normalized: str = normalize_title(cite_target)
        self.cite_target: str = normalized

    def __str__(self):
        """Return `{children:target}` with children space-separated."""
        children = " ".join(str(span) for span in self.spans)
        return f'{{{children}:{self.cite_target}}}'
|
||||
|
||||
|
||||
class RenderableVisitor():
    """
    Default visitor. Every hook returns the visitor itself; hooks for
    container tokens first visit the container's children, so the
    whole tree below a rendered token is walked.
    """
    def _recurse_and_return_self(self, span):
        # Shared body of every container hook: visit children, return self
        span.recurse(self)
        return self

    def TextSpan(self, span: TextSpan):
        """Leaf token: nothing to visit."""
        return self

    def LineBreak(self, span: LineBreak):
        """Leaf token: nothing to visit."""
        return self

    def ParsedArticle(self, span: ParsedArticle):
        """Visit the article's children."""
        return self._recurse_and_return_self(span)

    def BodyParagraph(self, span: BodyParagraph):
        """Visit the paragraph's children."""
        return self._recurse_and_return_self(span)

    def SignatureParagraph(self, span: SignatureParagraph):
        """Visit the signature paragraph's children."""
        return self._recurse_and_return_self(span)

    def BoldSpan(self, span: BoldSpan):
        """Visit the bold span's children."""
        return self._recurse_and_return_self(span)

    def ItalicSpan(self, span: ItalicSpan):
        """Visit the italic span's children."""
        return self._recurse_and_return_self(span)

    def CitationSpan(self, span: CitationSpan):
        """Visit the citation's children."""
        return self._recurse_and_return_self(span)
|
@ -1,31 +1,22 @@
|
||||
import re
|
||||
import urllib
|
||||
import urllib.parse
|
||||
|
||||
def normalize_title(title):
|
||||
"""
|
||||
Normalizes strings as titles:
|
||||
- Strips leading and trailing whitespace
|
||||
- Merges internal whitespace into a single space
|
||||
- Capitalizes the first word
|
||||
"""
|
||||
cleaned = re.sub(r'\s+', " ", title.strip())
|
||||
return cleaned[:1].capitalize() + cleaned[1:]
|
||||
|
||||
def titlesort(title):
|
||||
def titlesort(title: str) -> str:
    """
    Return the lowercased title with a leading article ("the", "an" or
    "a", followed by a space) removed, for alphabetical sorting.
    """
    lowered = title.lower()
    # Checked in this order; only the first matching article is removed
    for article in ("the ", "an ", "a "):
        if lowered.startswith(article):
            return lowered[len(article):]
    return lowered
|
||||
|
||||
def filesafe_title(title):
|
||||
|
||||
def filesafe_title(title: str) -> str:
|
||||
"""
|
||||
Makes an article title filename-safe.
|
||||
"""
|
||||
@ -34,4 +25,4 @@ def filesafe_title(title):
|
||||
s = urllib.parse.quote(s) # Encode all other characters
|
||||
s = re.sub(r"%", "", s) # Strip encoding %s
|
||||
s = s[:64] # Limit to 64 characters
|
||||
return s
|
||||
return s
|
||||
|
@ -1,74 +1,39 @@
|
||||
"""
|
||||
Internal module encapsulating the parsing logic for Lexipython
|
||||
markdown. Parse results are represented as a hierarchy of tokens, which
|
||||
can be rendered by a renderer.
|
||||
Internal module encapsulating a recursive descent parser for
|
||||
Lexipython markdown.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Sequence
|
||||
|
||||
from amanuensis.parser.helpers import normalize_title
|
||||
from amanuensis.parser.core import (
|
||||
TextSpan,
|
||||
LineBreak,
|
||||
ParsedArticle,
|
||||
BodyParagraph,
|
||||
SignatureParagraph,
|
||||
BoldSpan,
|
||||
ItalicSpan,
|
||||
CitationSpan,
|
||||
Renderable,
|
||||
SpanContainer
|
||||
)
|
||||
|
||||
class Renderable():
|
||||
def render(self, renderer):
|
||||
hook = getattr(renderer, type(self).__name__, None)
|
||||
if hook:
|
||||
return hook(self)
|
||||
return None
|
||||
|
||||
class TextSpan(Renderable):
|
||||
"""An unstyled length of text"""
|
||||
def __init__(self, innertext):
|
||||
self.innertext = innertext
|
||||
def __str__(self):
|
||||
return f"[{self.innertext}]"
|
||||
|
||||
class LineBreak(Renderable):
|
||||
"""A line break within a paragraph"""
|
||||
def __str__(self):
|
||||
return "<break>"
|
||||
|
||||
class SpanContainer(Renderable):
|
||||
"""A formatting element that wraps some amount of text"""
|
||||
def __init__(self, spans):
|
||||
self.spans = spans
|
||||
def __str__(self):
|
||||
return f"[{type(self).__name__} {' '.join([str(span) for span in self.spans])}]"
|
||||
def recurse(self, renderer):
|
||||
return [child.render(renderer) for child in self.spans]
|
||||
|
||||
class ParsedArticle(SpanContainer):
|
||||
"""Multiple paragraphs"""
|
||||
|
||||
class BodyParagraph(SpanContainer):
|
||||
"""A normal paragraph"""
|
||||
|
||||
class SignatureParagraph(SpanContainer):
|
||||
"""A paragraph preceded by a signature mark"""
|
||||
|
||||
class BoldSpan(SpanContainer):
|
||||
"""A span of text inside bold marks"""
|
||||
|
||||
class ItalicSpan(SpanContainer):
|
||||
"""A span of text inside italic marks"""
|
||||
|
||||
class CitationSpan(SpanContainer):
|
||||
"""A citation to another article"""
|
||||
def __init__(self, spans, cite_target):
|
||||
super().__init__(spans)
|
||||
# Normalize citation target
|
||||
self.cite_target = normalize_title(cite_target)
|
||||
def __str__(self):
|
||||
return f"{{{' '.join([str(span) for span in self.spans])}:{self.cite_target}}}"
|
||||
Spans = Sequence[Renderable]
|
||||
|
||||
|
||||
def parse_raw_markdown(text):
|
||||
def parse_raw_markdown(text: str) -> ParsedArticle:
    """
    Parse a body of Lexipython markdown into a Renderable tree rooted
    at a ParsedArticle.
    """
    # No formatting applies across paragraphs, so the text is cut at
    # every run of two or more newlines and each piece parsed alone
    return ParsedArticle(
        [parse_paragraph(chunk) for chunk in re.split(r'\n\n+', text)])
|
||||
|
||||
def parse_paragraph(text):
|
||||
|
||||
def parse_paragraph(text: str) -> SpanContainer:
|
||||
# Parse the paragraph as a span of text
|
||||
text = text.strip()
|
||||
if text and text[0] == '~':
|
||||
@ -76,7 +41,12 @@ def parse_paragraph(text):
|
||||
else:
|
||||
return BodyParagraph(parse_paired_formatting(text))
|
||||
|
||||
def parse_paired_formatting(text, cite=True, bold=True, italic=True):
|
||||
|
||||
def parse_paired_formatting(
|
||||
text: str,
|
||||
cite: bool = True,
|
||||
bold: bool = True,
|
||||
italic: bool = True) -> Spans:
|
||||
# Find positions of any paired formatting
|
||||
first_cite = find_pair(text, "[[", "]]", cite)
|
||||
first_bold = find_pair(text, "**", "**", bold)
|
||||
@ -93,7 +63,12 @@ def parse_paired_formatting(text, cite=True, bold=True, italic=True):
|
||||
first = min(finds) if finds else -1
|
||||
return handlers[first]()
|
||||
|
||||
def find_pair(text, open_tag, close_tag, valid):
|
||||
|
||||
def find_pair(
|
||||
text: str,
|
||||
open_tag: str,
|
||||
close_tag: str,
|
||||
valid: bool) -> int:
|
||||
# If skipping, return -1
|
||||
if not valid:
|
||||
return -1
|
||||
@ -108,7 +83,8 @@ def find_pair(text, open_tag, close_tag, valid):
|
||||
# Otherwise, the pair exists
|
||||
return first
|
||||
|
||||
def parse_citation(text, bold=True, italic=True):
|
||||
|
||||
def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
|
||||
cite_open = text.find("[[")
|
||||
if cite_open > -1:
|
||||
cite_close = text.find("]]", cite_open + 2)
|
||||
@ -128,50 +104,53 @@ def parse_citation(text, bold=True, italic=True):
|
||||
spans_inner = parse_paired_formatting(text_inner_actual,
|
||||
cite=False, bold=bold, italic=italic)
|
||||
citation = CitationSpan(spans_inner, cite_target)
|
||||
return spans_before + [citation] + spans_after
|
||||
return [*spans_before, citation, *spans_after]
|
||||
# Should never happen
|
||||
return parse_breaks(text)
|
||||
|
||||
def parse_bold(text, cite=True, italic=True):
|
||||
|
||||
def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans:
    """
    Split `text` around its first pair of `**` marks and wrap the
    enclosed text in a BoldSpan.

    `cite` and `italic` say which other formatting may still be parsed
    inside the bold text; bold itself is switched off there.
    """
    start = text.find("**")
    if start == -1:
        # Should never happen
        return parse_italic(text)
    end = text.find("**", start + 2)
    # There should be no formatting ahead of the opening mark, so only
    # line breaks are parsed there
    leading = parse_breaks(text[:start])
    # Everything past the closing mark is parsed with all formatting on
    trailing = parse_paired_formatting(text[end + 2:])
    # The enclosed text is parsed with bold disabled
    enclosed = parse_paired_formatting(
        text[start + 2:end], cite=cite, bold=False, italic=italic)
    return [*leading, BoldSpan(enclosed), *trailing]
|
||||
|
||||
def parse_italic(text, cite=True, bold=True):
|
||||
|
||||
def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans:
    """
    Split `text` around its first pair of `//` marks and wrap the
    enclosed text in an ItalicSpan.

    `cite` and `bold` say which other formatting may still be parsed
    inside the italic text; italic itself is switched off there.
    """
    start = text.find("//")
    if start == -1:
        # Should never happen
        return parse_breaks(text)
    end = text.find("//", start + 2)
    # There should be no formatting ahead of the opening mark, so only
    # line breaks are parsed there
    leading = parse_breaks(text[:start])
    # Everything past the closing mark is parsed with all formatting on
    trailing = parse_paired_formatting(text[end + 2:])
    # The enclosed text is parsed with italic disabled
    enclosed = parse_paired_formatting(
        text[start + 2:end], cite=cite, bold=bold, italic=False)
    return [*leading, ItalicSpan(enclosed), *trailing]
|
||||
|
||||
def parse_breaks(text):
|
||||
|
||||
def parse_breaks(text: str) -> Spans:
    """
    Turn unformatted text into TextSpans separated by LineBreaks.

    The text is cut wherever two backslashes are followed by a newline.
    Empty text gives an empty list.
    """
    if not text:
        return []
    first, *rest = text.split("\\\\\n")
    spans = [TextSpan(first)]
    for piece in rest:
        # One LineBreak goes between each pair of neighbouring pieces
        spans.extend((LineBreak(), TextSpan(piece)))
    return spans
|
@ -3,7 +3,7 @@ Internal module encapsulating visitors that render articles into
|
||||
readable formats.
|
||||
"""
|
||||
|
||||
from flask import url_for
|
||||
from typing import Iterable
|
||||
|
||||
from amanuensis.parser.helpers import filesafe_title
|
||||
|
||||
@ -12,9 +12,9 @@ class HtmlRenderer():
|
||||
"""
|
||||
Renders an article token tree into published article HTML.
|
||||
"""
|
||||
def __init__(self, lexicon_name, written_articles):
|
||||
self.lexicon_name = lexicon_name
|
||||
self.written_articles = written_articles
|
||||
def __init__(self, lexicon_name: str, written_articles: Iterable[str]):
|
||||
self.lexicon_name: str = lexicon_name
|
||||
self.written_articles: Iterable[str] = written_articles
|
||||
|
||||
def TextSpan(self, span):
|
||||
return span.innertext
|
||||
@ -50,11 +50,11 @@ class HtmlRenderer():
|
||||
# 'lexicon.article',
|
||||
# name=self.lexicon_name,
|
||||
# title=filesafe_title(span.cite_target))
|
||||
link = f'/lexicon/{self.lexicon_name}/article/{filesafe_title(span.cite_target)}'
|
||||
link = (f'/lexicon/{self.lexicon_name}'
|
||||
+ f'/article/{filesafe_title(span.cite_target)}')
|
||||
return f'<a href="{link}"{link_class}>{"".join(span.recurse(self))}</a>'
|
||||
|
||||
|
||||
|
||||
class PreviewHtmlRenderer():
|
||||
def __init__(self, lexicon):
|
||||
with lexicon.ctx.read('info') as info:
|
||||
|
@ -1,5 +1,7 @@
|
||||
astroid==2.3.3
|
||||
Click==7.0
|
||||
entrypoints==0.3
|
||||
flake8==3.7.9
|
||||
Flask==1.1.1
|
||||
Flask-Login==0.4.1
|
||||
Flask-WTF==0.14.2
|
||||
@ -9,10 +11,14 @@ Jinja2==2.10.3
|
||||
lazy-object-proxy==1.4.3
|
||||
MarkupSafe==1.1.1
|
||||
mccabe==0.6.1
|
||||
mypy==0.770
|
||||
mypy-extensions==0.4.3
|
||||
pkg-resources==0.0.0
|
||||
pylint==2.4.4
|
||||
pycodestyle==2.5.0
|
||||
pyflakes==2.1.1
|
||||
six==1.14.0
|
||||
typed-ast==1.4.1
|
||||
typing-extensions==3.7.4.2
|
||||
Werkzeug==0.16.0
|
||||
wrapt==1.11.2
|
||||
WTForms==2.2.1
|
||||
|
Loading…
Reference in New Issue
Block a user