Refactor title normalization

This commit is contained in:
Tim Van Baak 2020-04-11 20:45:16 -07:00
parent 3832a18d63
commit 49ed06fcb0
2 changed files with 26 additions and 2 deletions

View File

@ -0,0 +1,23 @@
def normalize_title(title):
    """
    Normalizes strings as titles:
    - Strips leading and trailing whitespace
    - Merges internal whitespace into a single space
    - Uppercases the first character, leaving the rest unchanged

    :param title: the raw title string
    :returns: the normalized title; an empty or all-whitespace input
        yields an empty string
    """
    # str.split() with no separator discards leading/trailing whitespace
    # and splits on runs of whitespace, so re-joining with one space does
    # the strip-and-collapse in one step without needing the re module.
    cleaned = " ".join(title.split())
    # Slicing (rather than indexing) keeps an empty string from raising.
    return cleaned[0:1].upper() + cleaned[1:]
def titlesort(title):
    """
    Builds the key used to sort titles alphabetically: the title is
    lowercased and a single leading article ("the", "an" or "a" followed
    by a space) is dropped if present
    """
    key = title.lower()
    # The prefixes are mutually exclusive, so at most one can match.
    for article in ("the ", "an ", "a "):
        if key.startswith(article):
            return key[len(article):]
    return key

View File

@ -6,6 +6,7 @@ can be rendered by a renderer.
import re
from amanuensis.parser.helpers import normalize_title
class Renderable():
def render(self, renderer):
@ -54,8 +55,8 @@ class CitationSpan(SpanContainer):
"""A citation to another article"""
def __init__(self, spans, cite_target):
"""
Passes spans to the parent constructor and sets cite_target to the
normalized form of the given citation target
"""
super().__init__(spans)
# Normalize citation target by eliminating most whitespace
self.cite_target = re.sub(r'\s+', " ", cite_target.strip())
# Normalize citation target
# NOTE(review): this assignment overwrites the one directly above, so only
# the normalize_title result is kept; the earlier line looks like the
# pre-refactor version retained by the diff view -- confirm.
self.cite_target = normalize_title(cite_target)
def __str__(self):
    """
    Formats the citation as the space-joined string forms of its spans,
    a colon, and the citation target, all wrapped in one pair of braces
    """
    joined = ' '.join(str(span) for span in self.spans)
    return f"{{{joined}:{self.cite_target}}}"