From c116476487d86cc8c20ad09495532a6134dbae8c Mon Sep 17 00:00:00 2001 From: Tim Van Baak Date: Thu, 6 Aug 2020 23:12:59 -0700 Subject: [PATCH] Clean up source update code --- inquisitor/cli.py | 31 ++++-------- inquisitor/loader.py | 59 ++++++++++++++++++++-- inquisitor/sources.py | 115 +++++++++++++++++++----------------------- 3 files changed, 117 insertions(+), 88 deletions(-) diff --git a/inquisitor/cli.py b/inquisitor/cli.py index 0e46c3e..4a0e385 100644 --- a/inquisitor/cli.py +++ b/inquisitor/cli.py @@ -107,23 +107,17 @@ def command_add(args): source = args.source or 'inquisitor' cell_path = os.path.join(DUNGEON_PATH, source) - if not os.path.isdir(cell_path): - if args.create: - os.mkdir(cell_path) - state_path = os.path.join(cell_path, "state") - with open(state_path, 'w', encoding='utf8') as f: - f.write(json.dumps({})) - else: - logger.error("Source '{}' does not exist".format(source)) - return -1 + if args.create: + from inquisitor.sources import ensure_cell + ensure_cell(source) + elif not os.path.isdir(cell_path): + logger.error("Source '{}' does not exist".format(source)) + return -1 - from inquisitor.sources import populate_new item = { - 'id': '{:x}'.format(random.getrandbits(16 * 4)), - 'source': 'inquisitor' + 'id': args.id or '{:x}'.format(random.getrandbits(16 * 4)), + 'source': source, } - if args.id: item['id'] = str(args.id) - if args.source: item['source'] = str(args.source) if args.title: item['title'] = str(args.title) if args.link: item['link'] = str(args.link) if args.time: item['time'] = int(args.time) @@ -133,13 +127,10 @@ def command_add(args): if args.ttl: item['ttl'] = int(args.ttl) if args.ttd: item['ttd'] = int(args.ttd) if args.tts: item['tts'] = int(args.tts) - populate_new(item['source'], item) - s = json.dumps(item, indent=2) - path = os.path.join(DUNGEON_PATH, item['source'], item['id'] + '.item') - with open(path, 'w', encoding='utf8') as f: - f.write(s) - logger.info(item) + from inquisitor.loader import new_item + saved_item = new_item(source, item) + logger.info(saved_item) def command_feed(args): diff --git a/inquisitor/loader.py b/inquisitor/loader.py index b7e4129..4045250 100644 --- a/inquisitor/loader.py +++ b/inquisitor/loader.py @@ -45,11 +45,6 @@ class WritethroughDict(): self.item[key] = value self.flush() - def set(self, dict): - for key, value in dict.items(): - self.item[key] = value - self.flush() - def __contains__(self, key): return key in self.item @@ -76,6 +71,60 @@ def load_item(source_name, item_id): item_path = os.path.join(DUNGEON_PATH, source_name, f'{item_id}.item') return WritethroughDict.load(item_path) + +def item_exists(source_name, item_id): + """ + Checks for the existence of an item. + """ + item_path = os.path.join(DUNGEON_PATH, source_name, f'{item_id}.item') + return os.path.isfile(item_path) + + +def get_item_ids(cell_name): + """ + Returns a list of item ids in the given cell. + """ + cell_path = os.path.join(DUNGEON_PATH, cell_name) + return [ + filename[:-5] + for filename in os.listdir(cell_path) + if filename.endswith('.item') + ] + + +def new_item(source_name, item): + """ + Creates a new item with the fields in the provided dictionary. + Initializes other fields to their default values. + """ + # id is required + if 'id' not in item: + raise Exception(f'Cannot create item with no id. Value = {item}') + + # source must be filled in, so if it is absent it is auto-populated with + # source_name. Note: this allows sources to fill in a different source. + if 'source' not in item: + item['source'] = source_name + + # active is forced to True for new items + item['active'] = True + + # created is forced to the current timestamp + item['created'] = timestamp.now() + + # title is auto-populated with the id if missing + if 'title' not in item: + item['title'] = item['id'] + + # tags is auto-populated if missing (not if empty!) + if 'tags' not in item: + item['tags'] = [source_name] + + # All other fields are optional. + item_path = os.path.join(DUNGEON_PATH, item['source'], f'{item["id"]}.item') + return WritethroughDict.create(item_path, item) + + def load_items(source_name): """ Returns a map of ids to items and a list of unreadable files. diff --git a/inquisitor/sources.py b/inquisitor/sources.py index dd50229..508a5b9 100644 --- a/inquisitor/sources.py +++ b/inquisitor/sources.py @@ -9,6 +9,19 @@ from inquisitor import loader, timestamp, error from inquisitor.configs import SOURCES_PATH, DUNGEON_PATH, logger +USE_NEWEST = ( + 'title', + 'tags', + 'link', + 'time' + 'author', + 'body', + 'ttl', + 'ttd', + 'tts', +) + + def ensure_cell(name): """ Creates a cell in the dungeon. Idempotent. @@ -92,54 +105,59 @@ def update_source(source_name, fetch_new): """ Attempts to update the given source. Raises an exception if the source does. """ - # Get the existing items from the source's cell. - prior_items, errors = loader.load_items(source_name) - if any(errors): - raise Exception(f'Can\'t update source "{source_name}", some items are corrupt') - logger.debug("Found {} prior items".format(len(prior_items))) + # Get a list of item ids that already existed in this source's cell. + prior_ids = loader.get_item_ids(source_name) + logger.debug(f'Found {len(prior_ids)} prior items') # Get the feed items from the source's fetch method. state = loader.load_state(source_name) fetched = fetch_new(state) - fetched_items = {item['id']: item for item in fetched} state.flush() + logger.debug(f'Fetched {len(fetched)} items') + fetched_items = {item['id']: item for item in fetched} - # Populate all the fetched items with required or auto-generated fields. - # This also provides an opportunity to throw if the source isn't returning - # valid items. - for item in fetched_items.values(): - populate_new(source_name, item) - logger.debug("Fetched {} items".format(len(fetched_items))) + # Determine which items are new and which are updates. + # We query the file system here instead of checking against this source's + # item ids from above because sources are allowed to generate in other + # sources' cells. + new_items = [] + updated_items = [] + for item in fetched: + item_source = item.get('source', source_name) + if loader.item_exists(item_source, item['id']): + updated_items.append(item) + else: + new_items.append(item) - # Write all the new fetched items to the source's cell. - new_items = [ - item for item in fetched_items.values() - if item['id'] not in prior_items] + # Write all the new items to the source's cell. for item in new_items: - s = json.dumps(item) - path = os.path.join(DUNGEON_PATH, item['source'], item['id'] + ".item") - with open(path, 'w', encoding='utf8') as f: - f.write(s) + item_source = item.get('source', source_name) + loader.new_item(item_source, item) - # Update the extant items using the fetched item's values. - extant_items = [ - item for item in fetched_items.values() - if item['id'] in prior_items] - for item in extant_items: - # The items in prior_items are writethrough dicts. - prior_item = prior_items[item['id']] - # Only bother updating active items. - if prior_item['active']: - populate_old(prior_item, item) + # Update the other items using the fetched items' values. + for new_item in updated_items: + old_item = loader.load_item(new_item['source'], new_item['id']) + for field in USE_NEWEST: + if field in new_item and old_item[field] != new_item[field]: + old_item[field] = new_item[field] + if 'callback' in new_item: + old_callback = old_item.get('callback', {}) + # Because of the way this update happens, any fields that are set + # in the callback when the item is new will keep their original + # values, as those values reappear in new_item on subsequent + # updates. + old_item['callback'] = {**old_item['callback'], **new_item['callback']} # In general, items are removed when they are old (not found in the last # fetch) and inactive. Some item fields can change this basic behavior. del_count = 0 now = timestamp.now() - old_items = [ - item for item in prior_items.values() - if item['id'] not in fetched_items] - for item in old_items: + fetched_ids = [item['id'] for item in updated_items] + old_item_ids = [ + item_id for item_id in prior_ids + if item_id not in fetched_ids] + for item_id in old_item_ids: + item = loader.load_item(source_name, item_id) remove = not item['active'] # The time-to-live field protects an item from removal until expiry. # This is mainly used to avoid old items resurfacing when their source @@ -170,35 +188,6 @@ def update_source(source_name, fetch_new): len(new_items), "s" if len(new_items) != 1 else "", del_count, "s" if del_count != 1 else "")) -def populate_new(source_name, item): - # id is required - if 'id' not in item: - raise Exception(f'Source "{source_name}" returned an item with no id') - # source is auto-populated with the source name if missing - # Note: this allows sources to create items in other cells! - if 'source' not in item: item['source'] = source_name - # active is forced to True for new items - item['active'] = True - # created is forced to the current timestamp - item['created'] = timestamp.now() - # title is auto-populated with the id if missing - if 'title' not in item: item['title'] = item['id'] - # tags is auto-populated if missing (not if empty!) - if 'tags' not in item: item['tags'] = [source_name] - # link, time, author, body, ttl, ttd, tts, callback are optional - -def populate_old(prior, new): - # Not updated: id, source, active, created - if 'title' in new: prior['title'] = new['title'] - if 'tags' in new: prior['tags'] = new['tags'] - if 'link' in new: prior['link'] = new['link'] - if 'time' in new: prior['time'] = new['time'] - if 'author' in new: prior['author'] = new['author'] - if 'body' in new: prior['body'] = new['body'] - if 'ttl' in new: prior['ttl'] = new['ttl'] - if 'ttd' in new: prior['ttd'] = new['ttd'] - if 'tts' in new: prior['tts'] = new['tts'] - if 'callback' in new: prior['callback'] = new['callback'] def item_callback(source_name, itemid): try: