diff --git a/.gitignore b/.gitignore
index b081f05..5d670f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ __pycache__/
 *.egg-info
 venv/
 .vscode
+.mypy_cache
\ No newline at end of file
diff --git a/amanuensis/parser/__init__.py b/amanuensis/parser/__init__.py
index fbe8e01..b30c929 100644
--- a/amanuensis/parser/__init__.py
+++ b/amanuensis/parser/__init__.py
@@ -1,8 +1,18 @@
 """
-Module encapsulating all markdown parsing functionality
+Module encapsulating all markdown parsing functionality.
 """
 
 from amanuensis.parser.analyze import FeatureCounter, GetCitations
 from amanuensis.parser.helpers import titlesort, filesafe_title
-from amanuensis.parser.tokenizer import parse_raw_markdown
-from amanuensis.parser.render import PreviewHtmlRenderer, HtmlRenderer
\ No newline at end of file
+from amanuensis.parser.parsing import parse_raw_markdown
+from amanuensis.parser.render import PreviewHtmlRenderer, HtmlRenderer
+
+__all__ = [
+	'FeatureCounter',
+	'GetCitations',
+	'titlesort',
+	'filesafe_title',
+	'parse_raw_markdown',
+	'PreviewHtmlRenderer',
+	'HtmlRenderer',
+]
diff --git a/amanuensis/parser/analyze.py b/amanuensis/parser/analyze.py
index 7845e17..4fa64b9 100644
--- a/amanuensis/parser/analyze.py
+++ b/amanuensis/parser/analyze.py
@@ -5,41 +5,22 @@ for verification against constraints.
 
 import re
 
-class RenderableVisitor():
-	"""Default implementation of the visitor pattern"""
-	def TextSpan(self, span):
-		return self
-	def LineBreak(self, span):
-		return self
-	def ParsedArticle(self, span):
-		span.recurse(self)
-		return self
-	def BodyParagraph(self, span):
-		span.recurse(self)
-		return self
-	def SignatureParagraph(self, span):
-		span.recurse(self)
-		return self
-	def BoldSpan(self, span):
-		span.recurse(self)
-		return self
-	def ItalicSpan(self, span):
-		span.recurse(self)
-		return self
-	def CitationSpan(self, span):
-		span.recurse(self)
-		return self
+from amanuensis.parser.core import RenderableVisitor
+
 
 class GetCitations(RenderableVisitor):
 	def __init__(self):
 		self.citations = []
+
 	def ParsedArticle(self, span):
 		span.recurse(self)
 		return self.citations
+
 	def CitationSpan(self, span):
 		self.citations.append(span.cite_target)
 		return self
+
 
 class FeatureCounter(RenderableVisitor):
 	def __init__(self):
 		self.word_count = 0
@@ -47,7 +28,7 @@ class FeatureCounter(RenderableVisitor):
 		self.has_signature = False
 
 	def TextSpan(self, span):
-		self.word_count += len(re.split('\s+', span.innertext.strip()))
+		self.word_count += len(re.split(r'\s+', span.innertext.strip()))
 		return self
 
 	def SignatureParagraph(self, span):
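As a usage sketch (mine, not part of this diff — the article text and signature are made up): both visitors are driven by calling `render()` on a parsed token tree, which the package-level exports above make straightforward:

```python
from amanuensis.parser import parse_raw_markdown, GetCitations, FeatureCounter

text = "An entry that cites [[Another Article]].\n\n~Ersatz Scrivener"
tree = parse_raw_markdown(text)

# GetCitations.ParsedArticle returns the accumulated list of cite targets
citations = tree.render(GetCitations())

# FeatureCounter walks the tree, tallying words as it visits TextSpans
counter = FeatureCounter()
tree.render(counter)
print(citations, counter.word_count)
```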
diff --git a/amanuensis/parser/core.py b/amanuensis/parser/core.py
new file mode 100644
index 0000000..76f15de
--- /dev/null
+++ b/amanuensis/parser/core.py
@@ -0,0 +1,135 @@
+"""
+Internal module encapsulating the core types for parsing Lexipython
+markdown. Parsed articles are represented as a hierarchy of tokens,
+which can be operated on by a visitor defining functions that hook off
+of the different token types.
+"""
+
+import re
+from typing import Callable, Any, Sequence
+
+RenderHook = Callable[['Renderable'], Any]
+Spans = Sequence['Renderable']
+
+
+def normalize_title(title: str) -> str:
+	"""
+	Normalizes strings as titles:
+	- Strips leading and trailing whitespace
+	- Merges internal whitespace into a single space
+	- Capitalizes the first word
+	"""
+	cleaned = re.sub(r'\s+', " ", title.strip())
+	return cleaned[:1].capitalize() + cleaned[1:]
+
+
+class Renderable():
+	"""
+	Base class for parsed markdown. Provides the `render()` method for
+	visiting the token tree.
+	"""
+	def render(self: 'Renderable', renderer: 'RenderableVisitor'):
+		"""
+		Execute the appropriate visitor method on this Renderable.
+		"""
+		hook: RenderHook = getattr(renderer, type(self).__name__, None)
+		if hook:
+			return hook(self)
+		return None
+
+
+class TextSpan(Renderable):
+	"""An unstyled length of text."""
+	def __init__(self, innertext: str):
+		self.innertext = innertext
+
+	def __str__(self):
+		return f"[{self.innertext}]"
+
+
+class LineBreak(Renderable):
+	"""A line break within a paragraph."""
+	def __str__(self):
+		return ""
+
+
+class SpanContainer(Renderable):
+	"""A formatting element that wraps some amount of text."""
+	def __init__(self, spans: Spans):
+		self.spans: Spans = spans
+
+	def __str__(self):
+		return (f'[{type(self).__name__} '
+			+ f'{" ".join([str(span) for span in self.spans])}]')
+
+	def recurse(self, renderer: 'RenderableVisitor'):
+		return [child.render(renderer) for child in self.spans]
+
+
+class ParsedArticle(SpanContainer):
+	"""Token tree root node, containing some number of paragraph tokens."""
+
+
+class BodyParagraph(SpanContainer):
+	"""A normal paragraph."""
+
+
+class SignatureParagraph(SpanContainer):
+	"""A paragraph preceded by a signature mark."""
+
+
+class BoldSpan(SpanContainer):
+	"""A span of text inside bold marks."""
+
+
+class ItalicSpan(SpanContainer):
+	"""A span of text inside italic marks."""
+
+
+class CitationSpan(SpanContainer):
+	"""A citation to another article."""
+	def __init__(self, spans: Spans, cite_target: str):
+		super().__init__(spans)
+		# Normalize citation target on parse, since we don't want
+		# abnormal title strings lying around causing trouble.
+		self.cite_target: str = normalize_title(cite_target)
+
+	def __str__(self):
+		return (f'{{{" ".join([str(span) for span in self.spans])}'
+			+ f':{self.cite_target}}}')
+
+
+class RenderableVisitor():
+	"""
+	Default implementation of the visitor pattern. Executes once on
+	each token in the tree and returns itself.
+	"""
+	def TextSpan(self, span: TextSpan):
+		return self
+
+	def LineBreak(self, span: LineBreak):
+		return self
+
+	def ParsedArticle(self, span: ParsedArticle):
+		span.recurse(self)
+		return self
+
+	def BodyParagraph(self, span: BodyParagraph):
+		span.recurse(self)
+		return self
+
+	def SignatureParagraph(self, span: SignatureParagraph):
+		span.recurse(self)
+		return self
+
+	def BoldSpan(self, span: BoldSpan):
+		span.recurse(self)
+		return self
+
+	def ItalicSpan(self, span: ItalicSpan):
+		span.recurse(self)
+		return self
+
+	def CitationSpan(self, span: CitationSpan):
+		span.recurse(self)
+		return self
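For orientation, a sketch of how the dispatch works (my example, not part of the commit): `render()` looks up a visitor method named after the token's class, so a new visitor only overrides the hooks it needs. For instance, collecting the text inside bold spans:

```python
from amanuensis.parser.core import RenderableVisitor, BoldSpan, TextSpan


class GetBoldText(RenderableVisitor):
	"""Illustrative visitor: gathers the plain text found inside bold marks."""
	def __init__(self):
		self.bold_text = []
		self.in_bold = False

	def BoldSpan(self, span: BoldSpan):
		self.in_bold = True
		span.recurse(self)
		self.in_bold = False
		return self

	def TextSpan(self, span: TextSpan):
		if self.in_bold:
			self.bold_text.append(span.innertext)
		return self
```

Running `tree.render(GetBoldText())` on a parsed article walks the whole tree, because the inherited container hooks all call `span.recurse(self)`.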
+ """ + def TextSpan(self, span: TextSpan): + return self + + def LineBreak(self, span: LineBreak): + return self + + def ParsedArticle(self, span: ParsedArticle): + span.recurse(self) + return self + + def BodyParagraph(self, span: BodyParagraph): + span.recurse(self) + return self + + def SignatureParagraph(self, span: SignatureParagraph): + span.recurse(self) + return self + + def BoldSpan(self, span: BoldSpan): + span.recurse(self) + return self + + def ItalicSpan(self, span: ItalicSpan): + span.recurse(self) + return self + + def CitationSpan(self, span: CitationSpan): + span.recurse(self) + return self diff --git a/amanuensis/parser/helpers.py b/amanuensis/parser/helpers.py index ccc6018..e6eabfb 100644 --- a/amanuensis/parser/helpers.py +++ b/amanuensis/parser/helpers.py @@ -1,31 +1,22 @@ import re -import urllib +import urllib.parse -def normalize_title(title): - """ - Normalizes strings as titles: - - Strips leading and trailing whitespace - - Merges internal whitespace into a single space - - Capitalizes the first word - """ - cleaned = re.sub(r'\s+', " ", title.strip()) - return cleaned[:1].capitalize() + cleaned[1:] -def titlesort(title): +def titlesort(title: str) -> str: """ Strips articles off of titles for alphabetical sorting purposes """ lower = title.lower() if lower.startswith("the "): return lower[4:] - elif lower.startswith("an "): + if lower.startswith("an "): return lower[3:] - elif lower.startswith("a "): + if lower.startswith("a "): return lower[2:] - else: - return lower + return lower -def filesafe_title(title): + +def filesafe_title(title: str) -> str: """ Makes an article title filename-safe. """ @@ -34,4 +25,4 @@ def filesafe_title(title): s = urllib.parse.quote(s) # Encode all other characters s = re.sub(r"%", "", s) # Strip encoding %s s = s[:64] # Limit to 64 characters - return s \ No newline at end of file + return s diff --git a/amanuensis/parser/tokenizer.py b/amanuensis/parser/parsing.py similarity index 58% rename from amanuensis/parser/tokenizer.py rename to amanuensis/parser/parsing.py index 92cfd3a..1e68aa1 100644 --- a/amanuensis/parser/tokenizer.py +++ b/amanuensis/parser/parsing.py @@ -1,74 +1,39 @@ """ -Internal module encapsulating the parsing logic for Lexipython -markdown. Parse results are represented as a hierarchy of tokens, which -can be rendered by a renderer. +Internal module encapsulating a recursive descent parser for +Lexipython markdown. 
""" import re +from typing import Sequence -from amanuensis.parser.helpers import normalize_title +from amanuensis.parser.core import ( + TextSpan, + LineBreak, + ParsedArticle, + BodyParagraph, + SignatureParagraph, + BoldSpan, + ItalicSpan, + CitationSpan, + Renderable, + SpanContainer +) -class Renderable(): - def render(self, renderer): - hook = getattr(renderer, type(self).__name__, None) - if hook: - return hook(self) - return None - -class TextSpan(Renderable): - """An unstyled length of text""" - def __init__(self, innertext): - self.innertext = innertext - def __str__(self): - return f"[{self.innertext}]" - -class LineBreak(Renderable): - """A line break within a paragraph""" - def __str__(self): - return "" - -class SpanContainer(Renderable): - """A formatting element that wraps some amount of text""" - def __init__(self, spans): - self.spans = spans - def __str__(self): - return f"[{type(self).__name__} {' '.join([str(span) for span in self.spans])}]" - def recurse(self, renderer): - return [child.render(renderer) for child in self.spans] - -class ParsedArticle(SpanContainer): - """Multiple paragraphs""" - -class BodyParagraph(SpanContainer): - """A normal paragraph""" - -class SignatureParagraph(SpanContainer): - """A paragraph preceded by a signature mark""" - -class BoldSpan(SpanContainer): - """A span of text inside bold marks""" - -class ItalicSpan(SpanContainer): - """A span of text inside italic marks""" - -class CitationSpan(SpanContainer): - """A citation to another article""" - def __init__(self, spans, cite_target): - super().__init__(spans) - # Normalize citation target - self.cite_target = normalize_title(cite_target) - def __str__(self): - return f"{{{' '.join([str(span) for span in self.spans])}:{self.cite_target}}}" +Spans = Sequence[Renderable] -def parse_raw_markdown(text): +def parse_raw_markdown(text: str) -> ParsedArticle: + """ + Parses a body of Lexipython markdown into a Renderable tree. 
+ """ # Parse each paragraph individually, as no formatting applies # across paragraphs paragraphs = re.split(r'\n\n+', text) parse_results = list(map(parse_paragraph, paragraphs)) return ParsedArticle(parse_results) -def parse_paragraph(text): + +def parse_paragraph(text: str) -> SpanContainer: # Parse the paragraph as a span of text text = text.strip() if text and text[0] == '~': @@ -76,7 +41,12 @@ def parse_paragraph(text): else: return BodyParagraph(parse_paired_formatting(text)) -def parse_paired_formatting(text, cite=True, bold=True, italic=True): + +def parse_paired_formatting( + text: str, + cite: bool = True, + bold: bool = True, + italic: bool = True) -> Spans: # Find positions of any paired formatting first_cite = find_pair(text, "[[", "]]", cite) first_bold = find_pair(text, "**", "**", bold) @@ -93,7 +63,12 @@ def parse_paired_formatting(text, cite=True, bold=True, italic=True): first = min(finds) if finds else -1 return handlers[first]() -def find_pair(text, open_tag, close_tag, valid): + +def find_pair( + text: str, + open_tag: str, + close_tag: str, + valid: bool) -> int: # If skipping, return -1 if not valid: return -1 @@ -108,7 +83,8 @@ def find_pair(text, open_tag, close_tag, valid): # Otherwise, the pair exists return first -def parse_citation(text, bold=True, italic=True): + +def parse_citation(text: str, bold: bool = True, italic: bool = True) -> Spans: cite_open = text.find("[[") if cite_open > -1: cite_close = text.find("]]", cite_open + 2) @@ -128,50 +104,53 @@ def parse_citation(text, bold=True, italic=True): spans_inner = parse_paired_formatting(text_inner_actual, cite=False, bold=bold, italic=italic) citation = CitationSpan(spans_inner, cite_target) - return spans_before + [citation] + spans_after + return [*spans_before, citation, *spans_after] # Should never happen return parse_breaks(text) -def parse_bold(text, cite=True, italic=True): + +def parse_bold(text: str, cite: bool = True, italic: bool = True) -> Spans: bold_open = text.find("**") if bold_open > -1: bold_close = text.find("**", bold_open + 2) # Should be no formatting behind us spans_before = parse_breaks(text[:bold_open]) # Freely parse formatting after us - spans_after = parse_paired_formatting(text[bold_close+2:]) + spans_after = parse_paired_formatting(text[bold_close + 2:]) # Parse inner text minus bold parsing - text_inner = text[bold_open+2:bold_close] + text_inner = text[bold_open + 2:bold_close] spans_inner = parse_paired_formatting(text_inner, cite=cite, bold=False, italic=italic) bold = BoldSpan(spans_inner) - return spans_before + [bold] + spans_after + return [*spans_before, bold, *spans_after] # Should never happen return parse_italic(text) -def parse_italic(text, cite=True, bold=True): + +def parse_italic(text: str, cite: bool = True, bold: bool = True) -> Spans: italic_open = text.find("//") if italic_open > -1: italic_close = text.find("//", italic_open + 2) # Should be no formatting behind us spans_before = parse_breaks(text[:italic_open]) # Freely parse formatting after us - spans_after = parse_paired_formatting(text[italic_close+2:]) + spans_after = parse_paired_formatting(text[italic_close + 2:]) # Parse inner text minus italic parsing - text_inner = text[italic_open+2:italic_close] + text_inner = text[italic_open + 2:italic_close] spans_inner = parse_paired_formatting(text_inner, cite=cite, bold=bold, italic=False) italic = ItalicSpan(spans_inner) - return spans_before + [italic] + spans_after + return [*spans_before, italic, *spans_after] # Should never happen return 
diff --git a/requirements.txt b/requirements.txt
index 2e1e2f3..0bd9c48 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 astroid==2.3.3
 Click==7.0
+entrypoints==0.3
+flake8==3.7.9
 Flask==1.1.1
 Flask-Login==0.4.1
 Flask-WTF==0.14.2
@@ -9,10 +11,14 @@ Jinja2==2.10.3
 lazy-object-proxy==1.4.3
 MarkupSafe==1.1.1
 mccabe==0.6.1
+mypy==0.770
+mypy-extensions==0.4.3
 pkg-resources==0.0.0
-pylint==2.4.4
+pycodestyle==2.5.0
+pyflakes==2.1.1
 six==1.14.0
 typed-ast==1.4.1
+typing-extensions==3.7.4.2
 Werkzeug==0.16.0
 wrapt==1.11.2
 WTForms==2.2.1
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..3137e31
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,13 @@
+[flake8]
+ignore =
+	W191 # we use tabs here
+	W503 # \n before binary op
+	E117 # broken for tabs
+	E126 # tabs
+	E128 # tabs
+exclude =
+	.git
+	__pycache__
+
+[mypy]
+ignore_missing_imports = True
\ No newline at end of file
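With these pins and the new tox.ini in place, running `flake8` and `mypy` from the repository root should pick up the configuration automatically, since both tools read their sections from tox.ini; the W191 and indentation-related ignores are there because the codebase indents with tabs.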