"""Title helpers for the amanuensis parser (amanuensis/parser/helpers.py).

Reconstructed from the patch "Refactor title normalization"
(commit 49ed06fcb0b128b655fd23a0accaedf50a362f9e, Tim Van Baak,
2020-04-11), which created this module.

The same patch changed amanuensis/parser/tokenizer.py to import
``normalize_title`` from this module and to set
``self.cite_target = normalize_title(cite_target)`` in
``CitationSpan.__init__`` instead of collapsing whitespace inline with
``re.sub``.
"""

# The patch as submitted used ``re.sub`` without importing ``re``, so the
# first call to normalize_title() raised NameError.
import re


def normalize_title(title):
    """
    Normalize a string for use as a title.

    - Strips leading and trailing whitespace
    - Merges each internal run of whitespace into a single space
    - Uppercases the first character; the rest of the string is unchanged

    Returns the normalized string. An empty or all-whitespace input
    yields the empty string.
    """
    cleaned = re.sub(r'\s+', " ", title.strip())
    # Slicing (rather than indexing) keeps the empty string from raising.
    return cleaned[0:1].upper() + cleaned[1:]


def titlesort(title):
    """
    Return a lowercased sort key for a title, with one leading article
    ("the ", "an " or "a ") removed.

    The article is only removed when it is followed by a space, so
    "Theory" and "Another" are left intact apart from lowercasing.
    """
    lower = title.lower()
    # Checked in the same order as the original if/elif chain.
    for article in ("the ", "an ", "a "):
        if lower.startswith(article):
            return lower[len(article):]
    return lower