Refactor and annotate parser submodule

Tim Van Baak 2020-04-21 12:52:02 -07:00
parent d0f57c85ce
commit 9e4144eccf
9 changed files with 243 additions and 127 deletions

.gitignore

@@ -4,3 +4,4 @@ __pycache__/
 *.egg-info
 venv/
 .vscode
+.mypy_cache

amanuensis/parser/__init__.py

@@ -1,8 +1,18 @@
 """
-Module encapsulating all markdown parsing functionality
+Module encapsulating all markdown parsing functionality.
 """
 from amanuensis.parser.analyze import FeatureCounter, GetCitations
 from amanuensis.parser.helpers import titlesort, filesafe_title
-from amanuensis.parser.tokenizer import parse_raw_markdown
+from amanuensis.parser.parsing import parse_raw_markdown
 from amanuensis.parser.render import PreviewHtmlRenderer, HtmlRenderer
+
+__all__ = [
+	'FeatureCounter',
+	'GetCitations',
+	'titlesort',
+	'filesafe_title',
+	'parse_raw_markdown',
+	'PreviewHtmlRenderer',
+	'HtmlRenderer',
+]
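As a usage sketch (not part of this commit), the re-exported names above might be driven like so; the sample markdown string is illustrative:

from amanuensis.parser import parse_raw_markdown, GetCitations, filesafe_title

raw = "A stub that cites [[Some Other Article]] and **bolds** a phrase."
article = parse_raw_markdown(raw)           # ParsedArticle token tree
citations = article.render(GetCitations())  # list of normalized cite targets
filenames = [filesafe_title(title) for title in citations]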

amanuensis/parser/analyze.py

@@ -5,41 +5,22 @@ for verification against constraints.
 import re
-class RenderableVisitor():
-	"""Default implementation of the visitor pattern"""
-	def TextSpan(self, span):
-		return self
-	def LineBreak(self, span):
-		return self
-	def ParsedArticle(self, span):
-		span.recurse(self)
-		return self
-	def BodyParagraph(self, span):
-		span.recurse(self)
-		return self
-	def SignatureParagraph(self, span):
-		span.recurse(self)
-		return self
-	def BoldSpan(self, span):
-		span.recurse(self)
-		return self
-	def ItalicSpan(self, span):
-		span.recurse(self)
-		return self
-	def CitationSpan(self, span):
-		span.recurse(self)
-		return self
+from amanuensis.parser.core import RenderableVisitor
 class GetCitations(RenderableVisitor):
 	def __init__(self):
 		self.citations = []
 	def ParsedArticle(self, span):
 		span.recurse(self)
 		return self.citations
 	def CitationSpan(self, span):
 		self.citations.append(span.cite_target)
 		return self
 class FeatureCounter(RenderableVisitor):
 	def __init__(self):
 		self.word_count = 0
@@ -47,7 +28,7 @@ class FeatureCounter(RenderableVisitor):
 		self.has_signature = False
 	def TextSpan(self, span):
-		self.word_count += len(re.split('\s+', span.innertext.strip()))
+		self.word_count += len(re.split(r'\s+', span.innertext.strip()))
 		return self
 	def SignatureParagraph(self, span):
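A hedged sketch of how these visitors are driven; the input text is made up, and the assumption that SignatureParagraph sets has_signature (its body is cut off above) is noted in the comments:

from amanuensis.parser.analyze import FeatureCounter
from amanuensis.parser.parsing import parse_raw_markdown

article = parse_raw_markdown("Some body text.\n\n~A signature paragraph")
counter = article.render(FeatureCounter())  # the default visitor returns itself
print(counter.word_count)                   # words accumulated by the TextSpan hook
print(counter.has_signature)                # presumably set by the SignatureParagraph hook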

amanuensis/parser/core.py (new file)

@@ -0,0 +1,135 @@
"""
Internal module encapsulating the core types for parsing Lexipython
markdown. Parsed articles are represented as a hierarchy of tokens,
which can be operated on by a visitor defining functions that hook off
of the different token types.
"""
import re
from typing import Callable, Any, Sequence

RenderHook = Callable[['Renderable'], Any]
Spans = Sequence['Renderable']


def normalize_title(title: str) -> str:
	"""
	Normalizes strings as titles:
	- Strips leading and trailing whitespace
	- Merges internal whitespace into a single space
	- Capitalizes the first word
	"""
	cleaned = re.sub(r'\s+', " ", title.strip())
	return cleaned[:1].capitalize() + cleaned[1:]


class Renderable():
	"""
	Base class for parsed markdown. Provides the `render()` method for
	visiting the token tree.
	"""
	def render(self: 'Renderable', renderer: 'RenderableVisitor'):
		"""
		Execute the appropriate visitor method on this Renderable.
		"""
		hook: RenderHook = getattr(renderer, type(self).__name__, None)
		if hook:
			return hook(self)
		return None


class TextSpan(Renderable):
	"""An unstyled length of text."""
	def __init__(self, innertext: str):
		self.innertext = innertext

	def __str__(self):
		return f"[{self.innertext}]"


class LineBreak(Renderable):
	"""A line break within a paragraph."""
	def __str__(self):
		return "<break>"


class SpanContainer(Renderable):
	"""A formatting element that wraps some amount of text."""
	def __init__(self, spans: Spans):
		self.spans: Spans = spans

	def __str__(self):
		return (f'[{type(self).__name__} '
			+ f'{" ".join([str(span) for span in self.spans])}]')

	def recurse(self, renderer: 'RenderableVisitor'):
		return [child.render(renderer) for child in self.spans]


class ParsedArticle(SpanContainer):
	"""Token tree root node, containing some number of paragraph tokens."""


class BodyParagraph(SpanContainer):
	"""A normal paragraph."""


class SignatureParagraph(SpanContainer):
	"""A paragraph preceded by a signature mark."""


class BoldSpan(SpanContainer):
	"""A span of text inside bold marks."""


class ItalicSpan(SpanContainer):
	"""A span of text inside italic marks."""


class CitationSpan(SpanContainer):
	"""A citation to another article."""
	def __init__(self, spans: Spans, cite_target: str):
		super().__init__(spans)
		# Normalize citation target on parse, since we don't want
		# abnormal title strings lying around causing trouble.
		self.cite_target: str = normalize_title(cite_target)

	def __str__(self):
		return (f'{{{" ".join([str(span) for span in self.spans])}'
			+ f':{self.cite_target}}}')


class RenderableVisitor():
	"""
	Default implementation of the visitor pattern. Executes once on
	each token in the tree and returns itself.
	"""
	def TextSpan(self, span: TextSpan):
		return self

	def LineBreak(self, span: LineBreak):
		return self

	def ParsedArticle(self, span: ParsedArticle):
		span.recurse(self)
		return self

	def BodyParagraph(self, span: BodyParagraph):
		span.recurse(self)
		return self

	def SignatureParagraph(self, span: SignatureParagraph):
		span.recurse(self)
		return self

	def BoldSpan(self, span: BoldSpan):
		span.recurse(self)
		return self

	def ItalicSpan(self, span: ItalicSpan):
		span.recurse(self)
		return self

	def CitationSpan(self, span: CitationSpan):
		span.recurse(self)
		return self
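To illustrate the dispatch-by-class-name pattern, a hypothetical visitor (not in this commit) that collects the text inside bold marks could subclass RenderableVisitor and override only the hooks it cares about:

from amanuensis.parser.core import RenderableVisitor, BoldSpan, TextSpan

class BoldTextCollector(RenderableVisitor):
	"""Collect the direct text children of every BoldSpan."""
	def __init__(self):
		self.bold_text = []

	def BoldSpan(self, span: BoldSpan):
		# Only TextSpan children carry raw text; nested containers are
		# handled by recursing into them with this same visitor.
		self.bold_text.extend(
			child.innertext for child in span.spans
			if isinstance(child, TextSpan))
		span.recurse(self)
		return self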

amanuensis/parser/helpers.py

@@ -1,31 +1,22 @@
 import re
-import urllib
-def normalize_title(title):
-	"""
-	Normalizes strings as titles:
-	- Strips leading and trailing whitespace
-	- Merges internal whitespace into a single space
-	- Capitalizes the first word
-	"""
-	cleaned = re.sub(r'\s+', " ", title.strip())
-	return cleaned[:1].capitalize() + cleaned[1:]
-def titlesort(title):
+import urllib.parse
+def titlesort(title: str) -> str:
 	"""
 	Strips articles off of titles for alphabetical sorting purposes
 	"""
 	lower = title.lower()
 	if lower.startswith("the "):
 		return lower[4:]
-	elif lower.startswith("an "):
+	if lower.startswith("an "):
 		return lower[3:]
-	elif lower.startswith("a "):
+	if lower.startswith("a "):
 		return lower[2:]
-	else:
-		return lower
-def filesafe_title(title):
+	return lower
+def filesafe_title(title: str) -> str:
 	"""
 	Makes an article title filename-safe.
 	"""
@@ -34,4 +25,4 @@ def filesafe_title(title):
 	s = urllib.parse.quote(s)  # Encode all other characters
 	s = re.sub(r"%", "", s)  # Strip encoding %s
 	s = s[:64]  # Limit to 64 characters
 	return s
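Expected behavior of the helpers, inferred from the code above (the example titles are made up):

from amanuensis.parser.helpers import titlesort, filesafe_title

titlesort("The Queen of Bats")    # -> 'queen of bats'
titlesort("A Minor Treatise")     # -> 'minor treatise'
filesafe_title("Queen of Bats?")  # percent-encodes unsafe characters, strips '%', caps at 64 chars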

amanuensis/parser/parsing.py

@@ -1,74 +1,39 @@
 """
-Internal module encapsulating the parsing logic for Lexipython
-markdown. Parse results are represented as a hierarchy of tokens, which
-can be rendered by a renderer.
+Internal module encapsulating a recursive descent parser for
+Lexipython markdown.
 """
 import re
-from amanuensis.parser.helpers import normalize_title
-class Renderable():
-	def render(self, renderer):
-		hook = getattr(renderer, type(self).__name__, None)
-		if hook:
-			return hook(self)
-		return None
-class TextSpan(Renderable):
-	"""An unstyled length of text"""
-	def __init__(self, innertext):
-		self.innertext = innertext
-	def __str__(self):
-		return f"[{self.innertext}]"
-class LineBreak(Renderable):
-	"""A line break within a paragraph"""
-	def __str__(self):
-		return "<break>"
-class SpanContainer(Renderable):
-	"""A formatting element that wraps some amount of text"""
-	def __init__(self, spans):
-		self.spans = spans
-	def __str__(self):
-		return f"[{type(self).__name__} {' '.join([str(span) for span in self.spans])}]"
-	def recurse(self, renderer):
-		return [child.render(renderer) for child in self.spans]
-class ParsedArticle(SpanContainer):
-	"""Multiple paragraphs"""
-class BodyParagraph(SpanContainer):
-	"""A normal paragraph"""
-class SignatureParagraph(SpanContainer):
-	"""A paragraph preceded by a signature mark"""
-class BoldSpan(SpanContainer):
-	"""A span of text inside bold marks"""
-class ItalicSpan(SpanContainer):
-	"""A span of text inside italic marks"""
-class CitationSpan(SpanContainer):
-	"""A citation to another article"""
-	def __init__(self, spans, cite_target):
-		super().__init__(spans)
-		# Normalize citation target
-		self.cite_target = normalize_title(cite_target)
-	def __str__(self):
-		return f"{{{' '.join([str(span) for span in self.spans])}:{self.cite_target}}}"
-def parse_raw_markdown(text):
+from typing import Sequence
+from amanuensis.parser.core import (
+	TextSpan,
+	LineBreak,
+	ParsedArticle,
+	BodyParagraph,
+	SignatureParagraph,
+	BoldSpan,
+	ItalicSpan,
+	CitationSpan,
+	Renderable,
+	SpanContainer
+)
+Spans = Sequence[Renderable]
+def parse_raw_markdown(text: str) -> ParsedArticle:
+	"""
+	Parses a body of Lexipython markdown into a Renderable tree.
+	"""
 	# Parse each paragraph individually, as no formatting applies
 	# across paragraphs
 	paragraphs = re.split(r'\n\n+', text)
 	parse_results = list(map(parse_paragraph, paragraphs))
 	return ParsedArticle(parse_results)
-def parse_paragraph(text):
+def parse_paragraph(text: str) -> SpanContainer:
 	# Parse the paragraph as a span of text
 	text = text.strip()
 	if text and text[0] == '~':
@@ -76,7 +41,12 @@ def parse_paragraph(text):
 	else:
 		return BodyParagraph(parse_paired_formatting(text))
-def parse_paired_formatting(text, cite=True, bold=True, italic=True):
+def parse_paired_formatting(
+		text: str,
+		cite: bool = True,
+		bold: bool = True,
+		italic: bool = True) -> Spans:
 	# Find positions of any paired formatting
 	first_cite = find_pair(text, "[[", "]]", cite)
 	first_bold = find_pair(text, "**", "**", bold)
@@ -93,7 +63,12 @@ def parse_paired_formatting(text, cite=True, bold=True, italic=True):
 	first = min(finds) if finds else -1
 	return handlers[first]()
-def find_pair(text, open_tag, close_tag, valid):
+def find_pair(
+		text: str,
+		open_tag: str,
+		close_tag: str,
+		valid: bool) -> int:
 	# If skipping, return -1
 	if not valid:
 		return -1
@@ -108,7 +83,8 @@ def find_pair(text, open_tag, close_tag, valid):
 	# Otherwise, the pair exists
 	return first
-def parse_citation(text, bold=True, italic=True):
+def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans:
 	cite_open = text.find("[[")
 	if cite_open > -1:
 		cite_close = text.find("]]", cite_open + 2)
@@ -128,50 +104,53 @@ def parse_citation(text, bold=True, italic=True):
 		spans_inner = parse_paired_formatting(text_inner_actual,
 			cite=False, bold=bold, italic=italic)
 		citation = CitationSpan(spans_inner, cite_target)
-		return spans_before + [citation] + spans_after
+		return [*spans_before, citation, *spans_after]
 	# Should never happen
 	return parse_breaks(text)
-def parse_bold(text, cite=True, italic=True):
+def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans:
 	bold_open = text.find("**")
 	if bold_open > -1:
 		bold_close = text.find("**", bold_open + 2)
 		# Should be no formatting behind us
 		spans_before = parse_breaks(text[:bold_open])
 		# Freely parse formatting after us
-		spans_after = parse_paired_formatting(text[bold_close+2:])
+		spans_after = parse_paired_formatting(text[bold_close + 2:])
 		# Parse inner text minus bold parsing
-		text_inner = text[bold_open+2:bold_close]
+		text_inner = text[bold_open + 2:bold_close]
 		spans_inner = parse_paired_formatting(text_inner,
 			cite=cite, bold=False, italic=italic)
 		bold = BoldSpan(spans_inner)
-		return spans_before + [bold] + spans_after
+		return [*spans_before, bold, *spans_after]
 	# Should never happen
 	return parse_italic(text)
-def parse_italic(text, cite=True, bold=True):
+def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans:
 	italic_open = text.find("//")
 	if italic_open > -1:
 		italic_close = text.find("//", italic_open + 2)
 		# Should be no formatting behind us
 		spans_before = parse_breaks(text[:italic_open])
 		# Freely parse formatting after us
-		spans_after = parse_paired_formatting(text[italic_close+2:])
+		spans_after = parse_paired_formatting(text[italic_close + 2:])
 		# Parse inner text minus italic parsing
-		text_inner = text[italic_open+2:italic_close]
+		text_inner = text[italic_open + 2:italic_close]
 		spans_inner = parse_paired_formatting(text_inner,
 			cite=cite, bold=bold, italic=False)
 		italic = ItalicSpan(spans_inner)
-		return spans_before + [italic] + spans_after
+		return [*spans_before, italic, *spans_after]
 	# Should never happen
 	return parse_breaks(text)
-def parse_breaks(text):
+def parse_breaks(text: str) -> Spans:
 	if not text:
 		return []
-	splits = list(map(TextSpan, text.split("\\\\\n")))
-	spans = [splits[0]]
-	for span in splits[1:]:
-		spans.append(LineBreak())
-		spans.append(span)
+	splits: Spans = list(map(TextSpan, text.split("\\\\\n")))
+	spans: Spans = [
+		splits[i // 2] if i % 2 == 0 else LineBreak()
+		for i in range(0, 2 * len(splits) - 1)
+	]
 	return spans
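A sketch of the interleaving that the rewritten parse_breaks performs, plus a top-level call; the inputs are illustrative, and parse_breaks is an internal helper rather than part of the public API:

from amanuensis.parser.parsing import parse_raw_markdown, parse_breaks

# "\\\\\n" in source is two literal backslashes followed by a newline,
# the Lexipython line-break marker that parse_breaks splits on.
spans = parse_breaks("one\\\\\ntwo\\\\\nthree")
# -> [TextSpan('one'), LineBreak(), TextSpan('two'), LineBreak(), TextSpan('three')]

tree = parse_raw_markdown("First paragraph.\n\n~A signature paragraph")
# -> a ParsedArticle containing a BodyParagraph and (presumably) a SignatureParagraph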

amanuensis/parser/render.py

@@ -3,7 +3,7 @@ Internal module encapsulating visitors that render articles into
 readable formats.
 """
-from flask import url_for
+from typing import Iterable
 from amanuensis.parser.helpers import filesafe_title
@@ -12,9 +12,9 @@ class HtmlRenderer():
 	"""
 	Renders an article token tree into published article HTML.
 	"""
-	def __init__(self, lexicon_name, written_articles):
-		self.lexicon_name = lexicon_name
-		self.written_articles = written_articles
+	def __init__(self, lexicon_name: str, written_articles: Iterable[str]):
+		self.lexicon_name: str = lexicon_name
+		self.written_articles: Iterable[str] = written_articles
 	def TextSpan(self, span):
 		return span.innertext
@@ -50,11 +50,11 @@ class HtmlRenderer():
 		# 	'lexicon.article',
 		# 	name=self.lexicon_name,
 		# 	title=filesafe_title(span.cite_target))
-		link = f'/lexicon/{self.lexicon_name}/article/{filesafe_title(span.cite_target)}'
+		link = (f'/lexicon/{self.lexicon_name}'
+			+ f'/article/{filesafe_title(span.cite_target)}')
 		return f'<a href="{link}"{link_class}>{"".join(span.recurse(self))}</a>'
 class PreviewHtmlRenderer():
 	def __init__(self, lexicon):
 		with lexicon.ctx.read('info') as info:
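A hypothetical rendering call: the lexicon name and article list are made up, and since the ParsedArticle hook is not shown in this hunk, the return value is assumed to be the assembled HTML:

from amanuensis.parser.parsing import parse_raw_markdown
from amanuensis.parser.render import HtmlRenderer

renderer = HtmlRenderer('example-lexicon', ['Queen of Bats'])
html = parse_raw_markdown("See [[Queen of Bats]].").render(renderer)
# Citations to titles in written_articles presumably get a normal link,
# while unwritten ones get a different link_class, per the handler above.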

requirements.txt

@@ -1,5 +1,7 @@
 astroid==2.3.3
 Click==7.0
+entrypoints==0.3
+flake8==3.7.9
 Flask==1.1.1
 Flask-Login==0.4.1
 Flask-WTF==0.14.2
@@ -9,10 +11,14 @@ Jinja2==2.10.3
 lazy-object-proxy==1.4.3
 MarkupSafe==1.1.1
 mccabe==0.6.1
+mypy==0.770
+mypy-extensions==0.4.3
 pkg-resources==0.0.0
-pylint==2.4.4
+pycodestyle==2.5.0
+pyflakes==2.1.1
 six==1.14.0
 typed-ast==1.4.1
+typing-extensions==3.7.4.2
 Werkzeug==0.16.0
 wrapt==1.11.2
 WTForms==2.2.1

tox.ini (new file)

@@ -0,0 +1,13 @@
[flake8]
ignore =
	W191 # we use tabs here
	W503 # \n before binary op
	E117 # broken for tabs
	E126 # tabs
	E128 # tabs
exclude =
	.git
	__pycache__

[mypy]
ignore_missing_imports = True