"""
Lexipython markdown parsing and preview rendering (``amanuensis.parser``).

This text comes from the patch "Add parser module" (commit
f7688429a8215eaee690bd3b2324b1d1164eae03, Tim Van Baak,
Wed, 4 Mar 2020 22:28:09 -0800), which creates three files:

* ``amanuensis/parser/__init__.py`` -- module encapsulating all markdown
  parsing functionality; it re-exports ``parse_raw_markdown`` and
  ``PreviewHtmlRenderer``.
* ``amanuensis/parser/render.py`` -- internal module encapsulating the
  render logic for parsed articles. Rendering is done via a rough
  approximation of the visitor pattern.
* ``amanuensis/parser/text.py`` -- internal module encapsulating the parsing
  logic for Lexipython markdown. Parse results are represented as a
  hierarchy of tokens, which can be rendered by a renderer.
"""

import html
import re


# ---------------------------------------------------------------------------
# amanuensis/parser/render.py
# ---------------------------------------------------------------------------

class PreviewHtmlRenderer():
    """Render a parsed article as an HTML fragment.

    Every method is named after the token class it renders (hence the
    PascalCase method names); tokens reach the right method through
    ``Renderable.render``.

    NOTE(review): the markup inside this class's string literals was not
    legible in the patch text under review (tag-like content had been
    stripped from it). The tags used below are the conventional ones for
    each token -- confirm them against the original commit.
    """

    def _render_children(self, span):
        """Return the concatenated rendering of all child spans of *span*."""
        return "".join(child.render(self) for child in span.spans)

    def TextSpan(self, span):
        """Render plain text, escaped so it cannot inject markup."""
        return html.escape(span.innertext, quote=False)

    def LineBreak(self, span):
        """Render a line break within a paragraph."""
        return '<br>'

    def ParsedArticle(self, span):
        """Render every paragraph, one per output line."""
        return '\n'.join(child.render(self) for child in span.spans)

    def BodyParagraph(self, span):
        """Render a normal paragraph."""
        return f'<p>{self._render_children(span)}</p>'

    def SignatureParagraph(self, span):
        """Render a signature paragraph, set apart from the article body."""
        return ('<hr><span class="signature"><p>'
                f'{self._render_children(span)}'
                '</p></span>')

    def BoldSpan(self, span):
        """Render bold text."""
        return f'<b>{self._render_children(span)}</b>'

    def ItalicSpan(self, span):
        """Render italic text."""
        return f'<i>{self._render_children(span)}</i>'

    def CitationSpan(self, span):
        """Render a citation as a link showing the citation text.

        Only ``cite_text`` is rendered; ``cite_target`` is not used here.
        NOTE(review): the ``href`` value is a placeholder -- confirm.
        """
        return f'<a href="#">{html.escape(span.cite_text, quote=False)}</a>'


# ---------------------------------------------------------------------------
# amanuensis/parser/text.py
# ---------------------------------------------------------------------------

class Renderable():
    """Base class for every token produced by the parser."""

    def render(self, renderer):
        """Dispatch to the renderer method named after this token's class.

        Raises AttributeError if *renderer* has no method with that name.
        """
        return getattr(renderer, type(self).__name__)(self)


class TextSpan(Renderable):
    """An unstyled length of text"""

    def __init__(self, innertext):
        self.innertext = innertext

    def __str__(self):
        return f"[{self.innertext}]"


class LineBreak(Renderable):
    """A line break within a paragraph"""

    def __str__(self):
        # NOTE(review): the patch text shows an empty string here, most
        # likely because a tag-like debug marker was stripped -- confirm.
        return "<break>"


class SpanContainer(Renderable):
    """A formatting element that wraps some amount of text"""

    def __init__(self, spans):
        self.spans = spans

    def __str__(self):
        inner = ' '.join(str(span) for span in self.spans)
        return f"[{type(self).__name__} {inner}]"


class ParsedArticle(SpanContainer):
    """Multiple paragraphs"""


class BodyParagraph(SpanContainer):
    """A normal paragraph"""


class SignatureParagraph(SpanContainer):
    """A paragraph preceded by a signature mark"""


class BoldSpan(SpanContainer):
    """A span of text inside bold marks"""


class ItalicSpan(SpanContainer):
    """A span of text inside italic marks"""


class CitationSpan(Renderable):
    """A citation to another article"""

    def __init__(self, cite_text, cite_target):
        self.cite_text = cite_text
        self.cite_target = cite_target

    def __str__(self):
        return f"{{{self.cite_text}:{self.cite_target}}}"


# Two backslashes immediately followed by a newline mark a line break.
_LINE_BREAK_MARK = "\\\\\n"


def _split_marked(text, open_mark, close_mark):
    """Yield ``(is_marked, chunk)`` pieces of *text*, in order.

    A chunk is marked when it sat between *open_mark* and the next
    *close_mark*. An opening mark with no closing mark is not a mark at all:
    it and everything after it are yielded as one unmarked chunk, so no
    character is ever dropped or duplicated. The last piece is always
    unmarked (and may be empty).
    """
    pos = 0
    while True:
        start = text.find(open_mark, pos)
        if start == -1:
            break
        end = text.find(close_mark, start + len(open_mark))
        if end == -1:
            # Unclosed mark: treat the remainder as literal text.
            break
        yield False, text[pos:start]
        yield True, text[start + len(open_mark):end]
        pos = end + len(close_mark)
    yield False, text[pos:]


def parse_raw_markdown(text):
    """Parse Lexipython markdown into a ``ParsedArticle``.

    Paragraphs are separated by one or more blank lines and are parsed
    individually, as no formatting applies across paragraphs.
    """
    # Text submitted through HTML forms uses CRLF line endings; normalize so
    # the paragraph and line-break marks below are recognized.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    paragraphs = re.split(r'\n\n+', text)
    return ParsedArticle([parse_paragraph(para) for para in paragraphs])


def parse_paragraph(text):
    """Parse one paragraph; a leading ``~`` marks a signature paragraph."""
    text = text.strip()
    if text.startswith('~'):
        return SignatureParagraph(parse_citations(text[1:]))
    return BodyParagraph(parse_citations(text))


def parse_citations(text):
    """Split *text* on ``[[text|target]]`` citations.

    Without a ``|`` the citation text doubles as the target. Text outside
    citations is parsed for the regular formatting.
    """
    spans = []
    for marked, chunk in _split_marked(text, "[[", "]]"):
        if marked:
            cite_text, sep, cite_target = chunk.partition("|")
            spans.append(CitationSpan(cite_text, cite_target if sep else cite_text))
        else:
            spans.extend(parse_bold(chunk))
    return spans


def parse_bold(text):
    """Split *text* on ``**bold**`` marks; italics may nest inside bold."""
    spans = []
    for marked, chunk in _split_marked(text, "**", "**"):
        if marked:
            spans.append(BoldSpan(parse_italic(chunk)))
        else:
            spans.extend(parse_italic(chunk))
    return spans


def parse_italic(text):
    """Split *text* on ``//italic//`` marks."""
    spans = []
    for marked, chunk in _split_marked(text, "//", "//"):
        if marked:
            spans.append(ItalicSpan(parse_breaks(chunk)))
        else:
            spans.extend(parse_breaks(chunk))
    return spans


def parse_breaks(text):
    """Split *text* into ``TextSpan`` pieces separated by ``LineBreak``.

    Returns an empty list for empty text.
    """
    if not text:
        return []
    spans = []
    for index, piece in enumerate(text.split(_LINE_BREAK_MARK)):
        if index:
            spans.append(LineBreak())
        spans.append(TextSpan(piece))
    return spans