Add import workflow with deflated object model
This commit is contained in:
parent
7c9c2232e2
commit
ecd9d67881
|
@ -0,0 +1,13 @@
|
||||||
|
import os
import logging

# Storage locations, overridable via the environment. `or` (not a plain
# default) means an empty env var also falls back to the relative path.
DUNGEON_PATH = os.path.abspath(os.environ.get("INQUISITOR_DUNGEON") or "./dungeon")
SOURCES_PATH = os.path.abspath(os.environ.get("INQUISITOR_SOURCES") or "./sources")

# Shared application logger: INFO and above, emitted to stderr.
logger = logging.getLogger("inquisitor")
logger.setLevel(logging.INFO)
formatter = logging.Formatter('[{levelname}] {message}', style="{")
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
logger.addHandler(handler)
|
|
@ -0,0 +1,22 @@
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
|
||||||
|
from configs import DUNGEON_PATH, logger

# NOTE(review): getLogger returns the same singleton instance already
# imported from configs above, so this rebinding is redundant (but
# harmless) — confirm whether it can be dropped.
logger = logging.getLogger("inquisitor")
|
||||||
|
|
||||||
|
def as_item(title, body=None):
    """Record an error as an item in the 'inquisitor' cell of the dungeon.

    The item is logged at error level and written to disk so it surfaces
    in the item feed alongside normal source items. If body is given it
    is wrapped in a <pre> element.
    """
    # 16 hex digits (64 random bits) serve as the item id.
    iid = format(random.getrandbits(64), 'x')
    item = {'id': iid, 'source': 'inquisitor', 'title': title}
    if body is not None:
        item['body'] = '<pre>{}</pre>'.format(body)
    target = os.path.join(DUNGEON_PATH, 'inquisitor', iid + ".item")
    logger.error(json.dumps(item))
    with open(target, 'w') as f:
        f.write(json.dumps(item, indent=2))
|
|
@ -0,0 +1,124 @@
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
import importlib.util
|
||||||
|
import json
|
||||||
|
|
||||||
|
import error
|
||||||
|
from configs import SOURCES_PATH, DUNGEON_PATH, logger
|
||||||
|
import loader
|
||||||
|
import timestamp
|
||||||
|
|
||||||
|
def update_sources(*source_names):
    """Update each named source in turn, reporting failures as error items.

    A failure to import or update one source is recorded via error.as_item
    and never aborts the remaining sources.
    """
    for source_name in source_names:
        try:
            source_module = load_source(source_name)
        except Exception:
            # The traceback carries the details; the unused `as e` binding
            # from the original is dropped.
            error.as_item("Error importing source '{}'".format(source_name), traceback.format_exc())
            continue

        try:
            logger.info("Updating source '{}'".format(source_name))
            new_count, del_count = update_source(source_name, source_module.fetch_new)
            logger.info("{} new item{}, {} deleted item{}".format(
                new_count, "s" if new_count != 1 else "",
                del_count, "s" if del_count != 1 else ""))
        except Exception:
            error.as_item("Error updating source '{}'".format(source_name), traceback.format_exc())
|
||||||
|
|
||||||
|
def load_source(source_name):
    """
    Attempts to load the source module with the given name.
    Raises FileNotFoundError if the source file is absent and ImportError
    if the module does not define fetch_new; other import-time exceptions
    propagate unchanged.
    """
    # Run the import from the sources directory so source modules can use
    # relative paths. The try/finally guarantees the original cwd is
    # restored even when exec_module raises — the original code only
    # restored it on the file-missing path, leaking the chdir on any
    # import error.
    cwd = os.getcwd()
    os.chdir(SOURCES_PATH)
    try:
        # Check if the named source is present.
        source_file_name = source_name + ".py"
        if not os.path.isfile(source_file_name):
            raise FileNotFoundError("Missing '{}' in '{}'".format(source_name, SOURCES_PATH))
        # Try to import the source module.
        logger.debug("Loading module {}".format(source_file_name))
        spec = importlib.util.spec_from_file_location("itemsource", source_file_name)
        itemsource = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(itemsource)
        if not hasattr(itemsource, 'fetch_new'):
            raise ImportError("Missing fetch_new in '{}'".format(source_file_name))
        return itemsource
    finally:
        os.chdir(cwd)
|
||||||
|
|
||||||
|
def update_source(source_name, fetch_new):
    """
    Attempts to update the given source. Raises an exception if the source does.

    Returns a (new_count, del_count) tuple: how many items were newly
    written and how many expired items were deleted.
    """
    # Get the existing items.
    prior_items, errors = loader.load_items(source_name)
    logger.debug("Found {} prior items".format(len(prior_items)))

    # Get the new items. The state dict is handed to the source's
    # fetch_new; flushing afterwards persists any changes it made.
    state = loader.load_state(source_name)
    new_items = fetch_new(state)
    logger.debug("Fetched {} items".format(len(new_items)))
    state.flush()

    new_count = 0
    del_count = 0
    for item in new_items:
        populate_new(item)

        if item['id'] not in prior_items:
            # If the item is new, write it. The ".item" suffix matches the
            # filenames produced by error.as_item and expected by the
            # deletion pass below — the original wrote bare ids here, so
            # items created by this function could never be removed by
            # the ttl cleanup.
            new_count += 1
            s = json.dumps(item)
            path = os.path.join(DUNGEON_PATH, item['source'], item['id'] + ".item")
            with open(path, 'w', encoding="utf8") as f:
                f.write(s)

        else:
            # If the item is extant and still active, overwrite its values.
            prior_item = prior_items[item['id']]
            if prior_item['active']:
                populate_old(prior_item, item)
            # Remove the id from the list to track its continued presence
            # in the source's queue of new items.
            del prior_items[item['id']]

    # Any remaining extant items are considered old. Old items are removed
    # when they are both inactive and past their ttl date.
    now = timestamp.now()
    for prior_id, prior_item in prior_items.items():
        ttl_date = prior_item['created'] + prior_item['ttl']
        if not prior_item['active'] and ttl_date < now:
            del_count += 1
            file_path = os.path.join(DUNGEON_PATH, prior_item['source'], prior_item['id'] + ".item")
            os.remove(file_path)

    # Return counts
    return new_count, del_count
|
||||||
|
|
||||||
|
def populate_new(item):
    """Fill in defaults for every optional field missing from an item.

    'id' and 'source' are required and assumed to be present already.
    A freshly fetched item is always marked active.
    """
    item['active'] = True
    # Only consult the clock when the source did not supply a timestamp.
    if 'created' not in item:
        item['created'] = timestamp.now()
    item.setdefault('title', item['id'])
    item.setdefault('link', None)
    item.setdefault('time', None)
    item.setdefault('author', None)
    item.setdefault('body', None)
    item.setdefault('tags', [item['source']])
    item.setdefault('ttl', 0)
|
||||||
|
|
||||||
|
def populate_old(prior, new):
    """Copy the refreshable fields of a newly fetched item onto the stored
    prior item via its set() method, persisting all of them in one flush."""
    refreshed_fields = ('title', 'link', 'time', 'author', 'body', 'tags', 'ttl')
    prior.set({field: new[field] for field in refreshed_fields})
|
|
@ -0,0 +1,61 @@
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
|
||||||
|
from configs import DUNGEON_PATH
|
||||||
|
import error
|
||||||
|
|
||||||
|
class WritethroughDict():
    """A wrapper for a dictionary saved to the disk.

    Every mutation through __setitem__ or set() is immediately flushed
    back to the backing JSON file.
    """
    def __init__(self, path):
        """Load the dictionary stored as JSON at path.

        Raises FileNotFoundError if there is no file at path.
        """
        if not os.path.isfile(path):
            raise FileNotFoundError(path)
        self.path = path
        # Read with the same encoding flush() writes with; the original
        # read with the locale default, which could mis-decode utf8 data
        # on some platforms.
        with open(path, encoding="utf8") as f:
            self.item = json.loads(f.read())

    def __getitem__(self, key):
        return self.item[key]

    def __setitem__(self, key, value):
        # Each single-key write is persisted immediately.
        self.item[key] = value
        self.flush()

    def set(self, dict):
        """Set several keys at once, flushing to disk only once.

        NOTE: the parameter name shadows the builtin `dict`; it is kept
        unchanged for interface compatibility with existing callers.
        """
        for key, value in dict.items():
            self.item[key] = value
        self.flush()

    def __contains__(self, key):
        return key in self.item

    def __repr__(self):
        return repr(self.item)

    def __str__(self):
        return str(self.item)

    def flush(self):
        """Write the current dictionary contents back to disk as JSON."""
        s = json.dumps(self.item, indent=2)
        with open(self.path, 'w', encoding="utf8") as f:
            f.write(s)
|
||||||
|
|
||||||
|
def load_state(source_name):
    """Loads the state dictionary for a source."""
    return WritethroughDict(os.path.join(DUNGEON_PATH, source_name, "state"))
|
||||||
|
|
||||||
|
def load_items(source_name):
    """
    Returns a map of ids to items and a list of unreadable files.
    """
    # NOTE(review): this scans every file in the cell, including the
    # "state" file, which presumably lacks an 'id' and so always lands
    # in the errors list — confirm whether that is intended.
    cell_path = os.path.join(DUNGEON_PATH, source_name)
    items, errors = {}, []
    for filename in os.listdir(cell_path):
        try:
            entry = WritethroughDict(os.path.join(cell_path, filename))
            items[entry['id']] = entry
        except Exception:
            errors.append(filename)
    return items, errors
|
|
@ -0,0 +1,9 @@
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
def now():
    """Return the current Unix time, truncated to whole seconds."""
    seconds = time.time()
    return int(seconds)
|
||||||
|
|
||||||
|
def stamp_to_readable(ts, formatstr="%Y-%m-%d %H:%M:%S"):
    """Render a Unix timestamp as a human-readable local-time string."""
    return datetime.datetime.fromtimestamp(ts).strftime(formatstr)
|
Loading…
Reference in New Issue