From ecd9d67881d633834f31abc6de2db4a6cbaf7df7 Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Mon, 16 Dec 2019 21:50:26 -0800 Subject: [PATCH] Add import workflow with deflated object model --- inquisitor/configs.py | 13 +++++ inquisitor/error.py | 22 +++++++ inquisitor/importer.py | 124 ++++++++++++++++++++++++++++++++++++++++ inquisitor/loader.py | 61 ++++++++++++++++++++ inquisitor/timestamp.py | 9 +++ 5 files changed, 229 insertions(+) create mode 100644 inquisitor/configs.py create mode 100644 inquisitor/error.py create mode 100644 inquisitor/importer.py create mode 100644 inquisitor/loader.py create mode 100644 inquisitor/timestamp.py diff --git a/inquisitor/configs.py b/inquisitor/configs.py new file mode 100644 index 0000000..38fc850 --- /dev/null +++ b/inquisitor/configs.py @@ -0,0 +1,13 @@ +import os +import logging + +DUNGEON_PATH = os.path.abspath(os.environ.get("INQUISITOR_DUNGEON") or "./dungeon") +SOURCES_PATH = os.path.abspath(os.environ.get("INQUISITOR_SOURCES") or "./sources") + +logger = logging.getLogger("inquisitor") +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('[{levelname}] {message}', style="{") +handler.setFormatter(formatter) +logger.addHandler(handler) diff --git a/inquisitor/error.py b/inquisitor/error.py new file mode 100644 index 0000000..d8f661c --- /dev/null +++ b/inquisitor/error.py @@ -0,0 +1,22 @@ +import os +import logging +import json +import random + +from configs import DUNGEON_PATH, logger + +logger = logging.getLogger("inquisitor") + +def as_item(title, body=None): + iid = '{:x}'.format(random.getrandbits(16 * 4)) + item = { + 'id': iid, + 'source': 'inquisitor', + 'title': title, + } + if body is not None: + item['body'] = '
{}
'.format(body) + path = os.path.join(DUNGEON_PATH, 'inquisitor', iid + ".item") + logger.error(json.dumps(item)) + with open(path, 'w') as f: + f.write(json.dumps(item, indent=2)) diff --git a/inquisitor/importer.py b/inquisitor/importer.py new file mode 100644 index 0000000..62f025e --- /dev/null +++ b/inquisitor/importer.py @@ -0,0 +1,124 @@ +import os +import traceback +import importlib.util +import json + +import error +from configs import SOURCES_PATH, DUNGEON_PATH, logger +import loader +import timestamp + +def update_sources(*source_names): + for source_name in source_names: + try: + source_module = load_source(source_name) + except Exception as e: + error.as_item("Error importing source '{}'".format(source_name), traceback.format_exc()) + continue + + try: + logger.info("Updating source '{}'".format(source_name)) + new_count, del_count = update_source(source_name, source_module.fetch_new) + logger.info("{} new item{}, {} deleted item{}".format( + new_count, "s" if new_count != 1 else "", + del_count, "s" if del_count != 1 else "")) + except Exception as e: + error.as_item("Error updating source '{}'".format(source_name), traceback.format_exc()) + +def load_source(source_name): + """ + Attempts to load the source module with the given name. Raises an exception on failure. + """ + # Push the sources directory + cwd = os.getcwd() + os.chdir(SOURCES_PATH) + # Check if the named source is present. + source_file_name = source_name + ".py" + if not os.path.isfile(source_file_name): + os.chdir(cwd) + raise FileNotFoundError("Missing '{}' in '{}'".format(source_name, SOURCES_PATH)) + # Try to import the source module. + logger.debug("Loading module {}".format(source_file_name)) + spec = importlib.util.spec_from_file_location("itemsource", source_file_name) + itemsource = importlib.util.module_from_spec(spec) + spec.loader.exec_module(itemsource) + if not hasattr(itemsource, 'fetch_new'): + raise ImportError("Missing fetch_new in '{}'".format(source_file_name)) + # Since the source is valid, get or create the source cell. + os.chdir(cwd) + cell_path = os.path.join(DUNGEON_PATH, source_name) + return itemsource + +def update_source(source_name, fetch_new): + """ + Attempts to update the given source. Raises an exception if the source does. + """ + cell_path = os.path.join(DUNGEON_PATH, source_name) + + # Get the existing items. + prior_items, errors = loader.load_items(source_name) + logger.debug("Found {} prior items".format(len(prior_items))) + + # Get the new items. + state = loader.load_state(source_name) + new_items = fetch_new(state) + logger.debug("Fetched {} items".format(len(new_items))) + state.flush() + + new_count = 0 + del_count = 0 + for item in new_items: + populate_new(item) + + if item['id'] not in prior_items: + # If the item is new, write it. + new_count += 1 + s = json.dumps(item) + path = os.path.join(DUNGEON_PATH, item['source'], item['id']) + with open(path, 'w', encoding="utf8") as f: + f.write(s) + + else: + # If the item is extant and still active, overwrite its values. + prior_item = prior_items[item['id']] + if prior_item['active']: + populate_old(prior_item, item) + # Remove the id from the list to track its continued presence + # in the source's queue of new items. + del prior_items[item['id']] + + # Any remaining extant items are considered old. Old items are removed + # when they are both inactive and past their ttl date. + now = timestamp.now() + for prior_id, prior_item in prior_items.items(): + ttl_date = prior_item['created'] + prior_item['ttl'] + if not prior_item['active'] and ttl_date < now: + del_count += 1 + file_path = os.path.join(DUNGEON_PATH, prior_item['source'], prior_item['id'] + ".item") + os.remove(file_path) + + # Return counts + return new_count, del_count + +def populate_new(item): + # id and source are required fields + item['active'] = True + if 'created' not in item: item['created'] = timestamp.now() + if 'title' not in item: item['title'] = item['id'] + if 'link' not in item: item['link'] = None + if 'time' not in item: item['time'] = None + if 'author' not in item: item['author'] = None + if 'body' not in item: item['body'] = None + if 'tags' not in item: item['tags'] = [item['source']] + if 'ttl' not in item: item['ttl'] = 0 + +def populate_old(prior, new): + prior.set({ + 'title': new['title'], + 'link': new['link'], + 'time': new['time'], + 'author': new['author'], + 'body': new['body'], + 'tags': new['tags'], + 'ttl': new['ttl'], + }) diff --git a/inquisitor/loader.py b/inquisitor/loader.py new file mode 100644 index 0000000..4ca3e3f --- /dev/null +++ b/inquisitor/loader.py @@ -0,0 +1,61 @@ +import os +import json + +from configs import DUNGEON_PATH +import error + +class WritethroughDict(): + """A wrapper for a dictionary saved to the disk.""" + def __init__(self, path): + if not os.path.isfile(path): + raise FileNotFoundError(path) + self.path = path + with open(path) as f: + self.item = json.loads(f.read()) + + def __getitem__(self, key): + return self.item[key] + + def __setitem__(self, key, value): + self.item[key] = value + self.flush() + + def set(self, dict): + for key, value in dict.items(): + self.item[key] = value + self.flush() + + def __contains__(self, key): + return key in self.item + + def __repr__(self): + return repr(self.item) + + def __str__(self): + return str(self.item) + + def flush(self): + s = json.dumps(self.item, indent=2) + with open(self.path, 'w', encoding="utf8") as f: + f.write(s) + +def load_state(source_name): + """Loads the state dictionary for a source.""" + state_path = os.path.join(DUNGEON_PATH, source_name, "state") + return WritethroughDict(state_path) + +def load_items(source_name): + """ + Returns a map of ids to items and a list of unreadable files. + """ + cell_path = os.path.join(DUNGEON_PATH, source_name) + items = {} + errors = [] + for filename in os.listdir(cell_path): + try: + path = os.path.join(cell_path, filename) + item = WritethroughDict(path) + items[item['id']] = item + except Exception as e: + errors.append(filename) + return items, errors \ No newline at end of file diff --git a/inquisitor/timestamp.py b/inquisitor/timestamp.py new file mode 100644 index 0000000..73a633b --- /dev/null +++ b/inquisitor/timestamp.py @@ -0,0 +1,9 @@ +import time +import datetime + +def now(): + return int(time.time()) + +def stamp_to_readable(ts, formatstr="%Y-%m-%d %H:%M:%S"): + dt = datetime.datetime.fromtimestamp(ts) + return dt.strftime(formatstr) \ No newline at end of file